diff --git a/.clang-tidy b/.clang-tidy index 2ddbefbf9..f9b77bce8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,10 +1,12 @@ --- InheritParentConfig: true -ExtraArgs: ['-v'] +ExtraArgs: [] FormatStyle: file UseColor: true WarningsAsErrors: '*' -ExcludeHeaderFilterRegex: '^(3rdparty|tvm)/.*$' +# FIXME: Use `ExcludeHeaderFilterRegex` instead when all maintainers upgraded their `clang-tidy` +HeaderFilterRegex: '^(?!.*(?:/|^)(3rdparty|tvm)/).*' +# ExcludeHeaderFilterRegex: '^(3rdparty|tvm)/.*$' # NOTE: there must be no spaces before the '-', so put the comma last. Checks: >- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 3ba13e0ce..0086358db 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1 @@ -blank_issues_enabled: false +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/release-plan.yml b/.github/ISSUE_TEMPLATE/release-plan.yml new file mode 100644 index 000000000..a3528275c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/release-plan.yml @@ -0,0 +1,63 @@ +name: "Release Plan" +description: "Plan the next release" +title: "[Release Plan] vX.Y.Z" +labels: + - release-plan + - tracking +assignees: [] +body: + - type: input + id: version + attributes: + label: "Version" + placeholder: "v0.2.0" + validations: + required: true + + - type: input + id: milestone + attributes: + label: "Milestone" + description: "Link or name of the milestone for this release" + placeholder: "https://github.com/tile-ai/tilelang/milestone/XX" + + - type: textarea + id: scope + attributes: + label: "Scope" + description: "Goals and non-goals (brief)" + placeholder: | + - Goals: ... + - Non-goals: ... + + - type: textarea + id: tasks + attributes: + label: "Tasks" + description: "Task list; link issues/PRs" + value: | + - [ ] Features + - [ ] Fixes + - [ ] Docs + - [ ] API/Breaking changes + - [ ] Benchmarks + - [ ] Release notes + + - type: checkboxes + id: readiness + attributes: + label: "Readiness" + options: + - label: "All planned issues closed or deferred" + - label: "Docs updated" + - label: "CI green; artifacts verified" + - label: "Release notes drafted" + + - type: textarea + id: notes + attributes: + label: "Notes" + description: "Risks or communications (optional)" + placeholder: | + - Risk: ... + - Communication: ... 
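Note on the `.clang-tidy` change above: the negative-lookahead `HeaderFilterRegex` is meant to keep analysis away from `3rdparty/` and `tvm/` headers until everyone can move back to `ExcludeHeaderFilterRegex`. Below is a minimal sketch of what the two patterns are intended to match, using Python's `re` purely for illustration; clang-tidy's own regex engine may not support identical syntax, and the sample paths are hypothetical.

```python
import re

# New filter: a header is analyzed only when no "3rdparty/" or "tvm/"
# path component appears anywhere in it (negative lookahead).
header_filter = re.compile(r"^(?!.*(?:/|^)(3rdparty|tvm)/).*")
# Old filter (commented out above): paths excluded explicitly.
exclude_filter = re.compile(r"^(3rdparty|tvm)/.*$")

# Hypothetical sample paths, for illustration only.
for path in ["src/op/gemm.cc", "3rdparty/tvm/include/tvm/ir/expr.h", "tvm/src/tir/expr.cc"]:
    print(path,
          "-> analyzed:", bool(header_filter.match(path)),
          "| matched by old exclude pattern:", bool(exclude_filter.match(path)))
```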
diff --git a/.github/workflows/amd_ci.yml b/.github/workflows/amd_ci.yml index 2ef300b66..144c0f09f 100644 --- a/.github/workflows/amd_ci.yml +++ b/.github/workflows/amd_ci.yml @@ -11,7 +11,7 @@ jobs: runs-on: [self-hosted, amd, gpu] permissions: - contents: write + contents: write steps: - name: Checkout repository @@ -56,7 +56,7 @@ jobs: echo "------------------------------------" exit 1 fi - + - name: Commit and Push Changes uses: stefanzweifel/git-auto-commit-action@v5 with: @@ -86,7 +86,7 @@ jobs: set -e REQS_HASH=$(sha256sum requirements-rocm.txt | cut -d ' ' -f 1) MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" - + echo "Installing requirements" if [[ -f "$MARKER" ]] && [[ -f "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" ]]; then echo "venv exists and hash matches – reuse it" @@ -117,4 +117,4 @@ jobs: source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" cd testing/python/amd unset PYTHONPATH - python -m pytest -v test_tilelang_test_amd.py \ No newline at end of file + python -m pytest -v test_tilelang_test_amd.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a04edc1eb..8d5f3ffb4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,154 +1,342 @@ name: CI -on: [pull_request] +on: + pull_request: + types: + - labeled + - unlabeled + - opened + - synchronize + - reopened + # Allow to trigger the workflow manually + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: "${{ github.workflow }}-${{ github.ref }}" + cancel-in-progress: ${{ github.event_name == 'pull_request' }} env: - PYTHON_VERSION: '3.12' - VENV_DIR: tilelang_ci + CLANG_TIDY_CMAKE_OPTIONS: "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON" # to be updated + PYTHONDEVMODE: "1" + PYTHONUNBUFFERED: "1" + PYTHONPATH: "" # explicit cleanup + PIP_USER: "" # explicit cleanup + COLUMNS: "100" + FORCE_COLOR: "1" + CLICOLOR_FORCE: "1" + UV_INDEX_STRATEGY: "unsafe-best-match" + UV_HTTP_TIMEOUT: "600" + XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated + PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated + UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated + PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pip/.pre-commit" # to be updated jobs: - format-check: - runs-on: [self-hosted, nvidia, hopper] + lint: + name: Quick Lint + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + submodules: recursive - permissions: - contents: write + - name: Setup Python 3.8 + id: setup-pylowest + uses: actions/setup-python@v6 + with: + python-version: "3.8" # use lowest supported version for linting + update-environment: false + + - name: Check AST with Python 3.8 + run: | + "${{ steps.setup-pylowest.outputs.python-path }}" -m compileall -q -f tilelang + + - name: Setup Python 3.9 + uses: actions/setup-python@v6 + with: + python-version: "3.9" + update-environment: true + cache: pip + cache-dependency-path: | + pyproject.toml + requirements*.txt + .pre-commit-config.yaml + + - name: Pre-commit Lint + run: | + if ! pipx run pre-commit run --all-files --color=always --show-diff-on-failure; then + echo "::error::Pre-commit checks failed. Please run 'pre-commit install' and 'pre-commit run --all-files' locally to see the issues." 
+ exit 1 + fi + + tests: + name: Test for Python ${{ matrix.python-version }} with ${{ matrix.runner.toolkit }} (on ${{ matrix.runner.name }}) + if: | + github.repository_owner == 'tile-ai' && + (github.event_name != 'pull_request' || !github.event.pull_request.draft) + needs: [lint] + runs-on: ${{ matrix.runner.tags }} + strategy: + matrix: + runner: + - tags: [self-hosted, tilescale] + name: self-hosted-nvidia + # Format: [Nightly-]CUDA-.[.]. E.g., "CUDA-12.8" or "Nightly-CUDA-13.0". + # Use "Nightly-" prefix to use torch nightly builds. + toolkit: CUDA-12.8 + python-version: + - "3.12" + fail-fast: false + timeout-minutes: 120 steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Ensure venv (local & persistent) - run: | - set -e - REQS_HASH=$(sha256sum requirements-test.txt 2>/dev/null | awk '{print $1}' || echo "no_requirements") - MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" - - if [[ -f "$MARKER" ]] && [[ -f "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" ]]; then - echo "venv exists and hash matches – reuse it" - else - echo "venv stale or missing – recreating" - rm -rf "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" "$MARKER" - python -m venv "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" - # shellcheck source=/dev/null - source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - python -m pip install --upgrade pip --no-user - [[ -f requirements-test.txt ]] && \ - PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-test.txt --no-user - pip install flash_attn==2.5.8 --no-user --no-build-isolation - touch "$MARKER" - fi - - - name: Run format check - run: | - source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - if ! output=$(./format.sh 2>&1); then - echo "------------------------------------" - echo "message:" - echo "$output" - printf '%s\n' "$output" | grep "Please review and stage the changes." 
- echo "------------------------------------" - exit 1 - fi - - - name: Commit and Push Changes - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: "lint" - - build-test-nvidia: - runs-on: [self-hosted, nvidia, hopper] - needs: format-check - permissions: - contents: read - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Ensure venv (local & persistent) - run: | - set -e - REQS_HASH=$(cat requirements-test.txt 2>/dev/null || true) - MARKER="${{ runner.tool_cache }}/.venv_marker_${{ env.PYTHON_VERSION }}_${REQS_HASH:0:8}" - # NOTE(wt): We disable the venv reuse for now to allow installing DeepEP - # echo "venv stale or missing – recreating" - rm -rf "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" - python -m venv "${{ runner.tool_cache }}/${{ env.VENV_DIR }}" - source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - python -m pip install --upgrade pip --no-user - [[ -f requirements-test.txt ]] && \ - PIP_NO_BUILD_ISOLATION=1 pip install -r requirements-test.txt --no-user - # flash attention usually requires no isolation build - pip install flash_attn==2.5.8 --no-user --no-build-isolation - - - name: Install project (wheel form) - run: | - source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - pip install . --no-user -v - bash tilelang/distributed/install_deepep.sh # Install DeepEP for testing purpose - - - name: Run examples - run: | - source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - cd examples - unset PYTHONPATH - - # find and run distributed tests with TILELANG_USE_DISTRIBUTED=1 - mapfile -t DIST_TESTS < <(find . -type f -path '*/distributed/*' -name 'test*.py' 2>/dev/null || true) - if [ "${#DIST_TESTS[@]}" -gt 0 ]; then + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + submodules: recursive + + - name: Set environment (self-hosted runners) + if: startsWith(matrix.runner.name, 'self-hosted') + run: | + # Hide sensitive data in logs for self-hosted runners + if [[ -n "${{ secrets.SECRET_PATH_PREFIXES }}" ]]; then + echo "::add-mask::${{ secrets.SECRET_PATH_PREFIXES }}" + # Colon separated list of secrets to mask + for secret in $(echo "${{ secrets.SECRET_PATH_PREFIXES }}" | tr ':' '\n'); do + echo "::add-mask::${secret}" + done + fi + + # Use runner tool_cache as cache root for self-hosted runners to avoid internet connection + # issues and to share cache between jobs. + export XDG_CACHE_HOME="${{ runner.tool_cache }}/.ci-cache-${{ github.workflow }}" + echo "XDG_CACHE_HOME=${XDG_CACHE_HOME}" | tee -a "${GITHUB_ENV}" + echo "PIP_CACHE_DIR=${XDG_CACHE_HOME}/pip" | tee -a "${GITHUB_ENV}" + echo "UV_CACHE_DIR=${XDG_CACHE_HOME}/uv" | tee -a "${GITHUB_ENV}" + echo "PRE_COMMIT_HOME=${XDG_CACHE_HOME}/pip/.pre-commit" | tee -a "${GITHUB_ENV}" + + # Do not use ccache on self-hosted runners, as it will download/upload caches which is slow. + # Self-hosted runners usually have more CPU power to compile without ccache. 
+ - name: Setup ccache (GitHub-hosted runners) + id: setup-ccache + if: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + uses: hendrikmuhs/ccache-action@v1 + with: + create-symlink: true + evict-old-files: "7d" + append-timestamp: false + key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }} + ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }} + ${{ runner.os }}-${{ runner.arch }} + + - name: Set environment (CUDA) + if: contains(matrix.runner.toolkit, 'CUDA') + run: | + TOOLKIT="${{ matrix.runner.toolkit }}" + CUDA_VERSION="${TOOLKIT##*-}" + CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)" + CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}" + if [[ "${TOOLKIT}" == "Nightly-"* ]]; then + # Use torch nightly builds + export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}" + else + export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}" + fi + export UV_INDEX="${PIP_EXTRA_INDEX_URL}" + export CLANG_TIDY_CMAKE_OPTIONS="${CLANG_TIDY_CMAKE_OPTIONS} -DUSE_CUDA=ON" + + echo "USE_CUDA=ON" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION_MAJMIN=${CUDA_VERSION_MAJMIN}" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION_MAJMIN_NODOT=${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}" + echo "PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}" | tee -a "${GITHUB_ENV}" + echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}" + echo "CLANG_TIDY_CMAKE_OPTIONS=${CLANG_TIDY_CMAKE_OPTIONS}" | tee -a "${GITHUB_ENV}" + + if [[ ! -x "$(command -v nvcc)" ]]; then + export PATH="/usr/local/cuda/bin:${PATH}" + export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + echo "PATH=${PATH}" | tee -a "${GITHUB_ENV}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" | tee -a "${GITHUB_ENV}" + fi + if [[ -x "$(command -v nvcc)" ]]; then + echo "\$ $(command -v nvcc) --version" && nvcc --version + else + echo "::warning::nvcc not found in PATH!" + fi + + - name: Setup Python and uv with caching + id: setup-uv + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python-version }} + activate-environment: true + # Do not use cache for self-hosted runners, as it will download/upload caches which is slow. 
+ enable-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + prune-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + # Use runner tool_cache for self-hosted runners + cache-local-path: ${{ env.UV_CACHE_DIR }} + ignore-nothing-to-cache: true + # Extra cache key to upload/download caches on GitHub-hosted runners + cache-suffix: uv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.runner.name }}-${{ matrix.runner.toolkit }} + cache-dependency-glob: | + pyproject.toml + requirements*.txt + .pre-commit-config.yaml + + - name: Setup venv + id: setup-venv + run: | + set -o pipefail + + uv pip install --upgrade pip setuptools wheel + if [[ "${UV_INDEX}" == *"/nightly/"* ]]; then + uv pip install --prerelease=allow -v torch + fi + uv pip install -v -r requirements-test.txt -i https://pypi.tuna.tsinghua.edu.cn/simple + echo "import torch; print(f'torch: {torch.__version__}')" | uv run --no-project --script - + if [[ "${{ matrix.runner.toolkit }}" == *"CUDA"* ]]; then + uv pip install --no-build-isolation-package=flash-attn -v -r requirements-test-cuda.txt -i https://pypi.tuna.tsinghua.edu.cn/simple + echo "import flash_attn; print(f'flash_attn: {flash_attn.__version__}')" | uv run --no-project --script - + # elif [[ "${{ matrix.runner.toolkit }}" == *"ROCm"* ]]; then + # uv pip install -v -r requirements-test-rocm.txt + # elif [[ "${{ matrix.runner.toolkit }}" == *"Metal"* ]]; then + # uv pip install -v -r requirements-test-metal.txt + else + echo "::error::Unknown toolkit: ${{ matrix.runner.toolkit }}" + exit 1 + fi + echo "::group::torch.utils.collect_env" + uv run --no-project -m -- torch.utils.collect_env + echo "::endgroup::" + + - name: Clear uv cache for self-hosted runners (if setup failed) + if: >- + ${{ + failure() && + startsWith(matrix.runner.name, 'self-hosted') && + (steps.setup-uv.conclusion == 'failure' || steps.setup-venv.conclusion == 'failure') + }} + run: | + echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure." + uv cache clean + + - name: Enable core dump generation (Linux / GitHub-hosted runners) + if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kernel.core_uses_pid=0 + sudo sysctl -w fs.suid_dumpable=1 + sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable + + - name: Enable core dump generation (macOS / GitHub-hosted runners) + if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.sugid_coredump=1 + sysctl kern.corefile kern.coredump kern.sugid_coredump + + - name: Install project (wheel form) + run: | + uv pip install -v . 
+ bash tilelang/distributed/install_deepep.sh # Install DeepEP for testing purpose + export NCCL_IB_DISABLE=1 # Our CI machine's IB is incomplete, disable it to avoid unnecessary error msgs + + # - name: Run clang-tidy + # id: clang-tidy + # if: runner.os == 'Linux' + # run: | + # echo "\$ $(command -v clang-tidy) --version" && clang-tidy --version + + # # Download run-clang-tidy script + # RCT_URL=https://raw.githubusercontent.com/llvm/llvm-project/refs/heads/release/21.x/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py + # echo "Downloading run-clang-tidy script from ${RCT_URL}" + # echo "import urllib.request; url = '${RCT_URL}'.rstrip('/'); urllib.request.urlretrieve(url, url.split('/')[-1])" | uv run --no-project --script - + # RUN_CLANG_TIDY=(uv run --no-project --script -- run-clang-tidy.py) + + # if [[ -x "$(command -v clang-apply-replacements)" ]]; then + # echo "Using clang-apply-replacements from $(command -v clang-apply-replacements)" + # RUN_CLANG_TIDY+=(-fix -clang-apply-replacements-binary="$(command -v clang-apply-replacements)") + # else + # echo "::warning::clang-apply-replacements not found in PATH, automatic fixing disabled." + # fi + + # # Run cmake to create the build directory with compile_commands.json + # cmake -S . -B cmake-build --fresh ${CLANG_TIDY_CMAKE_OPTIONS} # no quotes here + # echo "::group::compile_commands.json" + # ls -alh cmake-build/compile_commands.json + # uv run --no-project -m -- json.tool --no-ensure-ascii cmake-build/compile_commands.json + # echo "::endgroup::" + + # CXX_FILES=$(find src -type f -iname "*.[ch]pp" -o -iname "*.cc" -o -iname "*.c" -o -iname "*.h") + # rc=0 + # echo "::group::run-clang-tidy" + # "${RUN_CLANG_TIDY[@]}" -clang-tidy-binary="$(command -v clang-tidy)" \ + # -exclude-header-filter='^(3rdparty|tvm)/.*$' \ + # -p="cmake-build" ${CXX_FILES} || rc="$?" + # echo "::endgroup::" + # rm -rf cmake-build run-clang-tidy.py + # if (( rc != 0 )); then + # echo "::error::clang-tidy found issues (exit code: ${rc}). Please run 'clang-tidy --fix' locally to fix them." + # git diff --color=always || true + # exit "${rc}" + # fi + + - name: Run examples with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) + if: contains(matrix.runner.toolkit, 'CUDA') + run: | + cd examples + unset PYTHONPATH + PYTEST=( + uv run --no-project -m -- + pytest --verbose --color=yes --durations=0 --showlocals --cache-clear -r fE + ) + + # Run distributed tests (marked with @requires_distributed) with TILELANG_USE_DISTRIBUTED=1 + # DeepEP tests requires fullmesh nvl or internode environment, we disable for now echo "Running distributed examples with TILELANG_USE_DISTRIBUTED=1:" - printf '%s\n' "${DIST_TESTS[@]}" - TILELANG_USE_DISTRIBUTED=1 python -m pytest -n 1 "${DIST_TESTS[@]}" -v -r fE - else - echo "No distributed examples found." - fi - - # run remaining example tests (non-distributed) - mapfile -t OTHER_TESTS < <(find . -type f -name 'test*.py' ! -path '*/distributed/*' | grep -vE 'sink|vs_sparse' 2>/dev/null || true) # temporarily disable problematic tests - if [ "${#OTHER_TESTS[@]}" -gt 0 ]; then + TILELANG_USE_DISTRIBUTED=1 "${PYTEST[@]}" --maxfail=3 --numprocesses=1 -m distributed --ignore-glob='*deepep*' . || true + + # Run remaining example tests (non-distributed) + # Temporarily disable problematic tests: sink, vs_sparse echo "Running non-distributed examples:" - printf '%s\n' "${OTHER_TESTS[@]}" - python -m pytest -n 4 "${OTHER_TESTS[@]}" -v -r fE - else - echo "No non-distributed example tests found." 
- fi - - - name: Run tests - run: | - source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate" - cd testing/python - unset PYTHONPATH - - # run distributed tests first with env var - mapfile -t DIST_TESTS < <(find . -type f -path '*/distributed/*' -name 'test*.py' 2>/dev/null || true) - if [ "${#DIST_TESTS[@]}" -gt 0 ]; then + "${PYTEST[@]}" --maxfail=3 --numprocesses=2 -m "not distributed" -k "not sink and not vs_sparse" . || true + + # NVIDIA CUDA tests + - name: Run CUDA tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) + id: cuda-tests + if: contains(matrix.runner.toolkit, 'CUDA') + run: | + cd testing/python + unset PYTHONPATH + PYTEST=( + uv run --no-project -m -- + pytest --verbose --color=yes --durations=0 --showlocals --cache-clear -r fE + ) + + # Run distributed tests (marked with @requires_distributed) with TILELANG_USE_DISTRIBUTED=1 echo "Running distributed tests with TILELANG_USE_DISTRIBUTED=1:" - printf '%s\n' "${DIST_TESTS[@]}" - TILELANG_USE_DISTRIBUTED=1 python -m pytest -n 1 "${DIST_TESTS[@]}" -v -r fE - else - echo "No distributed tests found under testing/python." - fi - - # run remaining tests - mapfile -t OTHER_TESTS < <(find . -type f -name 'test*.py' ! -path '*/distributed/*' | grep -vE 'tilelibrary_gemm|jit_gemm_ctypes' 2>/dev/null || true) # temporarily disable problematic tests - if [ "${#OTHER_TESTS[@]}" -gt 0 ]; then + TILELANG_USE_DISTRIBUTED=1 "${PYTEST[@]}" --maxfail=3 --numprocesses=1 -m distributed . || true + + # Run remaining tests (non-distributed) + # Temporarily disable problematic tests: tilelibrary_gemm, jit_gemm_ctypes echo "Running non-distributed tests:" - printf '%s\n' "${OTHER_TESTS[@]}" - python -m pytest -n 4 "${OTHER_TESTS[@]}" -v -r fE - else - echo "No non-distributed tests found under testing/python." - fi + "${PYTEST[@]}" --maxfail=3 --numprocesses=2 -m "not distributed" -k "not tilelibrary_gemm and not jit_gemm_ctypes" . || true + + - name: List generated files + if: ${{ !cancelled() }} + run: | + find . -type f -name '*.py[co]' -delete + find . -depth -type d -name "__pycache__" -exec rm -r "{}" + + if git status --ignored --porcelain | grep -qvE '/$'; then + ls -alh $(git status --ignored --porcelain | grep -vE '/$' | grep -oE '\S+$') + fi diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 904fbb13b..74132ffb3 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -1,5 +1,6 @@ name: Dist on: + workflow_dispatch: schedule: # gemini said this is 6:00 china time - cron: "0 22 * * *" @@ -28,6 +29,18 @@ concurrency: group: "${{ github.workflow }}-${{ github.ref }}" cancel-in-progress: true +env: + PYTHONDEVMODE: "1" + PYTHONUNBUFFERED: "1" + COLUMNS: "100" + FORCE_COLOR: "1" + CLICOLOR_FORCE: "1" + UV_INDEX_STRATEGY: "unsafe-best-match" + UV_HTTP_TIMEOUT: "600" + XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated + PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated + UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated + jobs: build-wheels: name: Build wheels for Python ${{ matrix.python-version }} on ${{ matrix.target.runner }} with ${{ matrix.target.toolkit }} @@ -37,39 +50,41 @@ jobs: strategy: matrix: target: - - { runner: ubuntu-latest, toolkit: "CUDA-12.1" } - - { runner: ubuntu-24.04-arm, toolkit: "CUDA-12.8" } - - { runner: macos-latest, toolkit: "Metal" } + # NOTE(wt): Temporarily disable ARM and MacOS, as NVSHMEM only supports x86 (?) 
+ - { runner: ubuntu-latest, toolkit: "CUDA-12.8" } + # - { runner: ubuntu-24.04-arm, toolkit: "CUDA-12.8" } + - { runner: ubuntu-latest, toolkit: "Nightly-CUDA-13.0" } + # - { runner: ubuntu-24.04-arm, toolkit: "Nightly-CUDA-13.0" } + # - { runner: macos-latest, toolkit: "Metal" } python-version: - - "3.8" - # TVM is built with Python 3.8 Limited API, it should work with all Python >= 3.8. - # - "3.9" - # - "3.10" - # - "3.11" - # - "3.12" - # - "3.13" - # - "3.14" + # Wheels are built with Python 3.8 Limited API, they should work with all Python >= 3.8. + # Only build wheels against Python 3.8 Limited API to save CI resources. + - "3.9" fail-fast: false timeout-minutes: 120 runs-on: ${{ matrix.target.runner }} env: - NO_VERSION_LABEL: ${{ github.event_name == 'release' && 'OFF' || 'ON' }} + IS_RELEASE: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') }} + NO_VERSION_LABEL: "OFF" steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 1 submodules: recursive - # NB: CIBW builds wheels in containers on Linux - - name: Setup ccache (macOS only) - if: runner.os == 'macOS' + - name: Setup ccache uses: hendrikmuhs/ccache-action@v1 with: + max-size: "200MB" create-symlink: true - key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.target.toolkit }} evict-old-files: "7d" + append-timestamp: false + key: wheel-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/*.cc') }} + restore-keys: | + wheel-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/*.cc') }} + wheel-${{ runner.os }}-${{ runner.arch }} - name: Set CIBW_BUILD run: | @@ -80,21 +95,81 @@ jobs: if [[ "${{ matrix.target.toolkit }}" == *"CUDA"* ]]; then CUDA_VERSION="${{ matrix.target.toolkit }}" - CUDA_VERSION="${CUDA_VERSION#CUDA-}" + CUDA_VERSION="${CUDA_VERSION##*-}" + CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)" + CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}" echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}" + if [[ "${{ matrix.target.toolkit }}" == "Nightly-"* ]]; then + # Use torch nightly builds + export UV_INDEX="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}" + else + export UV_INDEX="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}" + echo "UV_TORCH_BACKEND=cu${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}" + fi + echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}" + fi + + if [[ "${{ env.IS_RELEASE }}" == "true" ]]; then + if [[ "${{ matrix.target.toolkit }}" == "Nightly-"* ]]; then + # Avoid using same file name for different toolkit. + echo "NO_GIT_VERSION=ON" | tee -a "${GITHUB_ENV}" + else + echo "NO_VERSION_LABEL=ON" | tee -a "${GITHUB_ENV}" + fi + fi + + if [[ "${{ runner.os }}" == "Linux" ]]; then + HOST_CCACHE_DIR="$(ccache --get-config cache_dir)" + # Install torch for tilescale_ext._C build, then setup ccache + echo "CIBW_BEFORE_BUILD_LINUX=pip install torch --no-cache-dir && dnf install -y ccache && ccache -o cache_dir=/host${HOST_CCACHE_DIR}" | tee -a "${GITHUB_ENV}" fi - name: Build wheels - uses: pypa/cibuildwheel@v3.2 + uses: pypa/cibuildwheel@v3.3 with: package-dir: . 
output-dir: wheelhouse config-file: "{package}/pyproject.toml" + - name: Setup Python and uv with caching + id: setup-uv + uses: astral-sh/setup-uv@v7 + with: + python-version: "3.12" + activate-environment: true + + - name: Test built wheels + # Skip CUDA wheel tests on GitHub-hosted runners (no CUDA available) + # Tests should be run on self-hosted runners with CUDA or during release validation + if: ${{ !contains(matrix.target.toolkit, 'CUDA') || contains(matrix.target.runner, 'self-hosted') }} + run: | + for WHEEL in wheelhouse/*.whl; do + echo "Testing wheel: ${WHEEL}" + ( + set -e + uv venv --python=3.12 test-venv + source test-venv/bin/activate + + uv pip install --upgrade pip setuptools wheel + if [[ "${UV_INDEX}" == *"/nightly/"* ]]; then + uv pip install --prerelease=allow -v torch + fi + + uv pip install -v "${WHEEL}" + ( + set -e + cd / + uv run --no-project -- python -c "import tilelang; print(tilelang.__version__)" + ) + deactivate + rm -rf test-venv + ) + done + - name: Upload wheels # Not PR to save artifact storage, as wheels are only needed for releases. - if: github.event_name != 'pull_request' - uses: actions/upload-artifact@v4 + if: github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') + uses: actions/upload-artifact@v6 with: name: wheels-${{ matrix.python-version }}-${{ runner.os }}-${{ runner.arch }}-${{ matrix.target.toolkit }} path: wheelhouse/*.whl @@ -102,14 +177,14 @@ jobs: list-artifacts: name: List artifacts - # Not PR to save artifact storage, as wheels are only needed for releases. - if: github.event_name != 'pull_request' + # Not PR to save artifact storage, as artifacts are only needed for releases. + if: github.event_name != 'pull_request' || contains(github.event.pull_request.title, '[Release]') runs-on: ubuntu-latest needs: [build-wheels] timeout-minutes: 15 steps: - name: Download built wheels - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v7 with: pattern: wheels-* path: dist @@ -119,7 +194,7 @@ jobs: run: ls -lh dist/* - name: Upload artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: artifacts path: dist/* diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml new file mode 100644 index 000000000..568ce8555 --- /dev/null +++ b/.github/workflows/pr-regression-test-bot.yml @@ -0,0 +1,273 @@ +name: Performance Regression Bot + +on: + issue_comment: + types: + - created + +permissions: + contents: read + issues: write + pull-requests: write + +concurrency: + # Use the issue/PR number to differentiate between different PRs + group: "${{ github.workflow }}-${{ github.event.issue.number }}" + cancel-in-progress: true + +env: + PYTHONDEVMODE: "1" + PYTHONUNBUFFERED: "1" + PYTHONPATH: "" # explicit cleanup + PIP_USER: "" # explicit cleanup + COLUMNS: "100" + FORCE_COLOR: "1" + CLICOLOR_FORCE: "1" + UV_INDEX_STRATEGY: "unsafe-best-match" + UV_HTTP_TIMEOUT: "600" + XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated + PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated + UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated + PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pip/.pre-commit" # to be updated + +jobs: + permissions-check: + name: Check bot permissions + if: | + github.repository_owner == 'tile-ai' && + github.event.issue.pull_request && + (contains(github.event.comment.body, '@regression-perf')) + runs-on: ubuntu-latest + steps: + - name: Get commenter 
permission + id: perm + uses: actions/github-script@v8 + with: + script: | + const username = context.payload.comment.user.login + const { owner, repo } = context.repo + const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username }) + core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none + + - name: Reject if not allowed + if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }} + run: | + echo "Not authorized: permission=${{ steps.perm.outputs.permission }}" + exit 1 + + pr-regression: + name: Performance regression test between PR and main + needs: [permissions-check] + runs-on: ${{ matrix.runner.tags }} + strategy: + matrix: + runner: + - tags: [self-hosted, nvidia] + name: self-hosted-nvidia + toolkit: CUDA-12.8 + python-version: + - "3.12" + fail-fast: false + timeout-minutes: 120 + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + ref: refs/pull/${{ github.event.issue.number }}/merge + fetch-depth: 0 + submodules: recursive + + - name: Set environment (self-hosted runners) + if: startsWith(matrix.runner.name, 'self-hosted') + run: | + # Hide sensitive data in logs for self-hosted runners + if [[ -n "${{ secrets.SECRET_PATH_PREFIXES }}" ]]; then + echo "::add-mask::${{ secrets.SECRET_PATH_PREFIXES }}" + # Colon separated list of secrets to mask + for secret in $(echo "${{ secrets.SECRET_PATH_PREFIXES }}" | tr ':' '\n'); do + echo "::add-mask::${secret}" + done + fi + + # Use runner tool_cache as cache root for self-hosted runners to avoid internet connection + # issues and to share cache between jobs. + export XDG_CACHE_HOME="${{ runner.tool_cache }}/.ci-cache-${{ github.workflow }}" + echo "XDG_CACHE_HOME=${XDG_CACHE_HOME}" | tee -a "${GITHUB_ENV}" + echo "PIP_CACHE_DIR=${XDG_CACHE_HOME}/pip" | tee -a "${GITHUB_ENV}" + echo "UV_CACHE_DIR=${XDG_CACHE_HOME}/uv" | tee -a "${GITHUB_ENV}" + echo "PRE_COMMIT_HOME=${XDG_CACHE_HOME}/pip/.pre-commit" | tee -a "${GITHUB_ENV}" + + # Do not use ccache on self-hosted runners, as it will download/upload caches which is slow. + # Self-hosted runners usually have more CPU power to compile without ccache. + - name: Setup ccache (GitHub-hosted runners) + id: setup-ccache + if: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + uses: hendrikmuhs/ccache-action@v1 + with: + create-symlink: true + evict-old-files: "7d" + append-timestamp: false + key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }} + ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }} + ${{ runner.os }}-${{ runner.arch }} + + - name: Set environment (CUDA) + if: contains(matrix.runner.toolkit, 'CUDA') + run: | + TOOLKIT="${{ matrix.runner.toolkit }}" + CUDA_VERSION="${TOOLKIT##*-}" + CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' 
-f-2)" + CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}" + if [[ "${TOOLKIT}" == "Nightly-"* ]]; then + # Use torch nightly builds + export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}" + else + export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}" + fi + export UV_INDEX="${PIP_EXTRA_INDEX_URL}" + export CLANG_TIDY_CMAKE_OPTIONS="${CLANG_TIDY_CMAKE_OPTIONS} -DUSE_CUDA=ON" + + echo "USE_CUDA=ON" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION_MAJMIN=${CUDA_VERSION_MAJMIN}" | tee -a "${GITHUB_ENV}" + echo "CUDA_VERSION_MAJMIN_NODOT=${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}" + echo "PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}" | tee -a "${GITHUB_ENV}" + echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}" + echo "CLANG_TIDY_CMAKE_OPTIONS=${CLANG_TIDY_CMAKE_OPTIONS}" | tee -a "${GITHUB_ENV}" + + if [[ ! -x "$(command -v nvcc)" ]]; then + export PATH="/usr/local/cuda/bin:${PATH}" + export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + echo "PATH=${PATH}" | tee -a "${GITHUB_ENV}" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" | tee -a "${GITHUB_ENV}" + fi + if [[ -x "$(command -v nvcc)" ]]; then + echo "\$ $(command -v nvcc) --version" && nvcc --version + else + echo "::warning::nvcc not found in PATH!" + fi + + - name: Setup Python and uv with caching + id: setup-uv + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python-version }} + activate-environment: true + # Do not use cache for self-hosted runners, as it will download/upload caches which is slow. + enable-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + prune-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }} + # Use runner tool_cache for self-hosted runners + cache-local-path: ${{ env.UV_CACHE_DIR }} + ignore-nothing-to-cache: true + # Extra cache key to upload/download caches on GitHub-hosted runners + cache-suffix: uv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.runner.name }}-${{ matrix.runner.toolkit }} + cache-dependency-glob: | + pyproject.toml + requirements*.txt + + - name: Setup environments + id: setup-venv + run: | + set -e + + uv venv --python "${{ matrix.python-version }}" new + + source new/bin/activate + uv pip install -v -r requirements-test.txt + uv pip install -v . + + - name: Install Main version (Baseline) + run: | + set -e + git clean -dxf -e new/ -e .cache/ + git checkout main + git submodule update --init --recursive + uv venv --python "${{ matrix.python-version }}" old + source old/bin/activate + + uv pip install -v -r requirements-test.txt + uv pip install -v . + rm -rf tilelang build + + uv venv --python "${{ matrix.python-version }}" test_regression + source test_regression/bin/activate + uv pip install -v -r requirements-test.txt + + - name: Clear uv cache for self-hosted runners (if setup failed) + if: >- + ${{ + failure() && + startsWith(matrix.runner.name, 'self-hosted') && + (steps.setup-uv.conclusion == 'failure' || steps.setup-venv.conclusion == 'failure') + }} + run: | + echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure." 
+ uv cache clean + + - name: Enable core dump generation (Linux / GitHub-hosted runners) + if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kernel.core_uses_pid=0 + sudo sysctl -w fs.suid_dumpable=1 + sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable + + - name: Enable core dump generation (macOS / GitHub-hosted runners) + if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }} + run: | + sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P" + sudo sysctl -w kern.coredump=1 + sudo sysctl -w kern.sugid_coredump=1 + sysctl kern.corefile kern.coredump kern.sugid_coredump + + - name: Run performance regression test + run: | + source test_regression/bin/activate + OLD_PYTHON=./old/bin/python NEW_PYTHON=./new/bin/python \ + PERF_REGRESSION_MD=regression_result.md PERF_REGRESSION_PNG=regression_result.png \ + python ./maint/scripts/test_perf_regression.py + + - name: Read markdown table + id: read_md + run: | + echo "content<<EOF" >> $GITHUB_OUTPUT + cat regression_result.md >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Upload result image as artifact + uses: actions/upload-artifact@v6 + with: + name: perf-regression-${{ github.run_id }} + path: regression_result.png + + - name: Post test results as PR comment + uses: actions/github-script@v8 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const fs = require('fs'); + // Read the file directly instead of passing via env/outputs to avoid escaping issues + const md = fs.readFileSync('regression_result.md', 'utf8'); + + const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`; + + const body = + 'Performance Regression Test Report\n' + + '============================\n\n' + + `Triggered by: @${context.payload.comment.user.login}\n` + + `Workflow run: ${runUrl}\n\n` + + 'Results\n' + + '-------\n\n' + + md + '\n\n' + + 'Artifacts\n' + + '---------\n\n' + + '- regression_result.png (speedup plot) is attached as a workflow artifact.
Download it from the workflow run page above.\n'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body + }); diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 953303102..2197015b6 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -25,7 +25,7 @@ jobs: runs-on: [self-hosted, nvidia] steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: recursive diff --git a/.gitignore b/.gitignore index 75aa07f82..e85c2c094 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,8 @@ debug/ build/ *dist/ +dist*/ +!distributed*/ wheelhouse/ __pycache__ nnfusion.tar.gz @@ -110,3 +112,24 @@ nvshmem_src/ # CMake cmake-build/ cmake-build-*/ + +# Git version for sdist +.git_commit.txt + +# pre-commit cache +.pre-commit-cache/* + +# host checks logs +maint/host_checks/logs/* + +# ncu +*.ncu-rep + +# csv +*.csv + +# clang-tidy +/run-clang-tidy.py + +# perf regression test +.perf_regression/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 99a05f4c6..f52f91b53 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,15 +13,13 @@ repos: hooks: - id: check-symlinks - id: destroyed-symlinks - # FIXME: enable these hooks - # - id: trailing-whitespace - # - id: end-of-file-fixer + - id: trailing-whitespace + - id: end-of-file-fixer - id: check-added-large-files - id: check-merge-conflict fail_fast: true - # FIXME: enable these hooks - # - id: check-executables-have-shebangs - # - id: check-shebang-scripts-are-executable + - id: check-executables-have-shebangs + - id: check-shebang-scripts-are-executable - id: detect-private-key - id: check-yaml - id: check-toml @@ -32,39 +30,30 @@ repos: args: [--ignore-case] files: ^docs/spelling_wordlist\.txt$ - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v21.1.2 # sync with requirements-lint.txt + rev: v21.1.7 # sync with requirements-lint.txt hooks: - id: clang-format - exclude: | - (?ix)( - ^.+\.(cu|cuh)$| - ^.+\.json$ - ) + types_or: [c++, c] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.1 # sync with requirements-lint.txt + rev: v0.14.9 # sync with requirements-lint.txt hooks: - id: ruff-check args: [--fix, --exit-non-zero-on-fix] - - repo: https://github.com/google/yapf - rev: v0.43.0 # sync with requirements-lint.txt - hooks: - - id: yapf - name: yapf-multiproc-bugfix - # yapf is not multiprocess safe, so we run a dummy yapf first. 
- args: [--in-place, docs/conf.py] - always_run: true - pass_filenames: false - - id: yapf - args: [--recursive, --in-place] + - id: ruff-format + args: [--exit-non-zero-on-format] - repo: https://github.com/codespell-project/codespell rev: v2.4.1 # sync with requirements-lint.txt hooks: - id: codespell additional_dependencies: [".[toml]"] - args: ["-L", "HDA"] exclude: | (?x)( ^.+\.(cpp|hpp|cxx|cc|c|h|cu|cuh)$| ^.+\.svg$| ^.*\brequirements\b.*\.txt$ ) + - repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.33 + hooks: + - id: pymarkdown + args: ["--config", ".pymarkdown", "fix"] diff --git a/.pymarkdown b/.pymarkdown new file mode 100644 index 000000000..5394265ed --- /dev/null +++ b/.pymarkdown @@ -0,0 +1,37 @@ +{ + "plugins": { + "md003": { + "style": "atx" + }, + "md004": { + "style": "dash" + }, + "md013": { + "enabled": false + }, + "md026": { + "enabled": false + }, + "md029": { + "enabled": false + }, + "md031": { + "enabled": false + }, + "md032": { + "enabled": false + }, + "md033": { + "enabled": false + }, + "md034": { + "enabled": false + }, + "md040": { + "enabled": false + }, + "md041": { + "enabled": false + } + } +} diff --git a/3rdparty/tvm b/3rdparty/tvm index 5bf17a346..23bce012f 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 5bf17a34602931e7d7e01cbccf358a21fe972779 +Subproject commit 23bce012ffd255a24289eea6ceab74a40b94a096 diff --git a/CMakeLists.txt b/CMakeLists.txt index afeccaceb..4fb370d50 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,11 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND "$ENV{CIBUILDWHEEL}") + # Warning came from tvm submodule + string(APPEND CMAKE_CXX_FLAGS " -Wno-dangling-reference") +endif() + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.gitmodules" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") @@ -36,15 +41,74 @@ endif() find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) + message(STATUS "Using ccache: ${CCACHE_PROGRAM} with base_dir=${CMAKE_SOURCE_DIR}") + if(APPLE) + # Passing configs like `ccache base_dir=/xxx cc ...` is supported + # (likely) since ccache 4.x, which has been provided by homebrew. + # Our Linux builder image (manylinux2014 & manylinux_2_28) still + # provides ccache 3.x and do not support this form. + # `cibuildwheel` uses fixed folder on Linux (`/project`) as working directory, + # so cache would work without setting `base_dir`. 
+ set(CCACHE_PROGRAM "${CCACHE_PROGRAM};base_dir=${CMAKE_SOURCE_DIR}") + endif() set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "C compiler launcher") set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher") set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher") +else() + find_program(SCCACHE_PROGRAM sccache) + if(SCCACHE_PROGRAM) + message(STATUS "Using sccache: ${SCCACHE_PROGRAM}") + set(CMAKE_C_COMPILER_LAUNCHER "${SCCACHE_PROGRAM}" CACHE STRING "C compiler launcher") + set(CMAKE_CXX_COMPILER_LAUNCHER "${SCCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${SCCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher") + endif() endif() # Configs -set(USE_CUDA OFF) -set(USE_ROCM OFF) -set(USE_METAL OFF) +set(TILELANG_BACKENDS CUDA ROCM METAL) + +set(TILELANG_BACKEND_DOC_CUDA "Enable CUDA backend (ON/OFF/or CUDA SDK path)") +set(TILELANG_BACKEND_DOC_ROCM "Enable ROCm backend (ON/OFF/or ROCm SDK path)") +set(TILELANG_BACKEND_DOC_METAL "Enable Metal backend") + +# TVM's config.cmake redefines USE_* options later, so we cache the user's choice +# (including explicit -DUSE_XXX arguments) before we include TVM and restore it +# afterwards. + +macro(tilelang_define_backend_option BACKEND) + set(_backend_var "USE_${BACKEND}") + set(_doc "${TILELANG_BACKEND_DOC_${BACKEND}}") + set(_user_override_var "TILELANG_USER_OVERRIDE_${_backend_var}") + + set(_user_override OFF) + if(DEFINED ${_user_override_var}) + set(_user_override "${${_user_override_var}}") + endif() + + if(DEFINED CACHE{${_backend_var}}) + get_property(_cache_type CACHE ${_backend_var} PROPERTY TYPE) + if(_cache_type STREQUAL "UNINITIALIZED") + set(_user_override ON) + endif() + endif() + + set(_default OFF) + if(DEFINED ${_backend_var}) + set(_default "${${_backend_var}}") + endif() + + option(${_backend_var} "${_doc}" "${_default}") + # Remember if the user explicitly set this option so that later logic + # won't auto-toggle backends they configured on the command line. + set(${_user_override_var} ${_user_override} CACHE INTERNAL + "User explicitly set ${_backend_var} during configuration" FORCE) + set(TILELANG_OPTION_${_backend_var} "${${_backend_var}}") +endmacro() + +foreach(BACKEND IN LISTS TILELANG_BACKENDS) + tilelang_define_backend_option(${BACKEND}) +endforeach() + set(PREBUILD_CYTHON ON) # Configs end @@ -55,6 +119,14 @@ if(EXISTS ${TVM_SOURCE}/cmake/config.cmake) else() message(FATAL_ERROR "Nor tvm provided or submodule checkout-ed.") endif() +# Re-apply TileLang's preferred backend settings after TVM's config may have +# overridden the USE_* cache entries. 
+foreach(BACKEND IN LISTS TILELANG_BACKENDS) + set(_backend_var "USE_${BACKEND}") + set(_doc "${TILELANG_BACKEND_DOC_${BACKEND}}") + set(${_backend_var} ${TILELANG_OPTION_${_backend_var}} CACHE STRING "${_doc}" FORCE) + set(${_backend_var} ${TILELANG_OPTION_${_backend_var}}) +endforeach() # Include directories for TileLang set(TILE_LANG_INCLUDES ${TVM_INCLUDES}) @@ -64,33 +136,50 @@ file(GLOB TILE_LANG_SRCS src/*.cc src/layout/*.cc src/transform/*.cc + src/transform/common/*.cc src/op/*.cc src/target/utils.cc + src/target/codegen_c_host.cc src/target/codegen_cpp.cc src/target/rt_mod_cpp.cc - # webgpu doesn't have system dependency - src/target/codegen_webgpu.cc # intrin_rule doesn't have system dependency src/target/intrin_rule*.cc ) -# Backend-specific checks and configs -if($ENV{USE_METAL}) - set(USE_METAL ON) -elseif(APPLE) - message(STATUS "Enable Metal support by default.") - set(USE_METAL ON) -elseif($ENV{USE_ROCM}) - set(USE_ROCM ON) -else() - if($ENV{USE_CUDA}) - set(USE_CUDA ON) - elseif(DEFINED ENV{USE_CUDA} AND NOT $ENV{USE_CUDA}) - # Build CPU-only when we explicitly disable CUDA - set(USE_CUDA OFF) +# Always include CPU-safe runtime helpers +list(APPEND TILE_LANG_SRCS + src/runtime/error_helpers.cc +) + +# Track if the user explicitly selected a backend via cache options. +set(TILELANG_BACKEND_USER_SELECTED OFF) +foreach(BACKEND IN LISTS TILELANG_BACKENDS) + set(_backend_var "USE_${BACKEND}") + set(_override_var "TILELANG_USER_OVERRIDE_${_backend_var}") + if(${_backend_var} OR ${_override_var}) + set(TILELANG_BACKEND_USER_SELECTED ON) + endif() +endforeach() + +# Only auto-select a backend when the user didn't specify one explicitly. +if(NOT TILELANG_BACKEND_USER_SELECTED) + if($ENV{USE_METAL}) + set(USE_METAL ON) + elseif(APPLE) + message(STATUS "Enable Metal support by default.") + set(USE_METAL ON) + elseif($ENV{USE_ROCM}) + set(USE_ROCM ON) else() - message(STATUS "Enable CUDA support by default.") - set(USE_CUDA ON) + if($ENV{USE_CUDA}) + set(USE_CUDA ON) + elseif(DEFINED ENV{USE_CUDA} AND NOT $ENV{USE_CUDA}) + # Build CPU-only when we explicitly disable CUDA + set(USE_CUDA OFF) + else() + message(STATUS "Enable CUDA support by default.") + set(USE_CUDA ON) + endif() endif() endif() @@ -104,7 +193,7 @@ if(USE_METAL) elseif(USE_ROCM) set(CMAKE_HIP_STANDARD 17) include(${TVM_SOURCE}/cmake/utils/FindROCM.cmake) - find_rocm($ENV{USE_ROCM}) + find_rocm(${USE_ROCM}) add_compile_definitions(__HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__=1) file(GLOB TILE_LANG_HIP_SRCS @@ -123,16 +212,29 @@ elseif(USE_CUDA) cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA) file(GLOB TILE_LANG_CUDA_SRCS - src/runtime/*.cc + src/runtime/runtime.cc + src/runtime/tilescale_cuda_module.cc src/target/ptx.cc src/target/codegen_cuda.cc + src/target/codegen_py.cc + src/target/codegen_utils.cc + src/target/codegen_cutedsl.cc src/target/rt_mod_cuda.cc + src/target/rt_mod_cutedsl.cc ) list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS}) list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) endif() +set(USE_Z3 ON CACHE STRING "Use Z3 SMT solver for TileLang optimizations") +set(USE_PYPI_Z3 ON CACHE BOOL "Use Z3 provided by PyPI z3-solver package") + +if(USE_Z3 AND USE_PYPI_Z3) + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/pypi-z3") + find_package(Z3 REQUIRED) +endif() + # Include tvm after configs have been populated add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) @@ -140,7 +242,11 @@ add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) 
add_compile_definitions(DMLC_USE_LOGGING_LIBRARY=) add_library(tilelang_objs OBJECT ${TILE_LANG_SRCS}) + +# Set debug mode compile definitions +# We enable the debug option of TVM, i.e. TVM_LOG_DEBUG if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message(STATUS "Building TileLang with DEBUG mode") target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") endif() @@ -148,12 +254,20 @@ target_include_directories(tilelang_objs PRIVATE ${TILE_LANG_INCLUDES}) add_library(tilelang SHARED $<TARGET_OBJECTS:tilelang_objs>) add_library(tilelang_module SHARED $<TARGET_OBJECTS:tilelang_objs>) -target_link_libraries(tilelang PUBLIC tvm_runtime) +target_link_libraries(tilelang PUBLIC tvm_runtime tvm) target_link_libraries(tilelang_module PUBLIC tvm) -if(APPLE) - # FIXME: libtilelang should only link against tvm runtime - target_link_libraries(tilelang PUBLIC tvm) -endif() + +# Place dev build outputs under build/lib for consistency +set_target_properties(tilelang PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) +set_target_properties(tilelang_module PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) # Build cython extension find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT}) @@ -173,26 +287,112 @@ if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "") endif() python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI) -# Install extension into the tilelang package directory + +# Ensure dev builds drop the extension into build/lib alongside other shared libs +set_target_properties(tilelang_cython_wrapper PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) + +# Install the extension into tilelang/lib inside the wheel install(TARGETS tilelang_cython_wrapper - LIBRARY DESTINATION tilelang - RUNTIME DESTINATION tilelang - ARCHIVE DESTINATION tilelang) + LIBRARY DESTINATION tilelang/lib + RUNTIME DESTINATION tilelang/lib + ARCHIVE DESTINATION tilelang/lib) + +# Copy libz3.so to build folder to workaround isolated build env issue +if(USE_Z3 AND USE_PYPI_Z3) + get_target_property(Z3_LIBRARY_PATH z3::libz3 IMPORTED_LOCATION) + install(FILES "${Z3_LIBRARY_PATH}" DESTINATION "${CMAKE_BINARY_DIR}/tvm") + if(APPLE) + set_target_properties(tvm PROPERTIES BUILD_RPATH "@loader_path") + else() + set_target_properties(tvm PROPERTIES BUILD_RPATH "\$ORIGIN") + endif() +endif() -# let libtilelang to search tvm/tvm_runtime in same dir if(APPLE) - set_target_properties(tilelang PROPERTIES INSTALL_RPATH "@loader_path") - set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "@loader_path") -else() - set_target_properties(tilelang PROPERTIES INSTALL_RPATH "\$ORIGIN") - set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "\$ORIGIN") + set(TILELANG_INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") + if(USE_Z3 AND USE_PYPI_Z3) + # some z3 is placed in lib/ and some in bin/, we add both in rpath + list(APPEND TILELANG_INSTALL_RPATH "@loader_path/../../z3/lib" "@loader_path/../../z3/bin") + endif() +elseif(UNIX) + set(TILELANG_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") + if(USE_Z3 AND USE_PYPI_Z3) + # cmake uses ; by default, we explicitly use : for linux + string(APPEND TILELANG_INSTALL_RPATH
":\$ORIGIN/../../z3/lib") + endif() endif() -install(TARGETS tvm tvm_runtime tilelang_module tilelang LIBRARY DESTINATION tilelang/lib) +set_target_properties( + tilelang tilelang_module tvm tvm_runtime + PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") -# Copy tvm cython ext for wheels -# TODO: not necessary for editable builds -if(TVM_BUILD_FROM_SOURCE) - add_dependencies(tilelang tvm_cython) - install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tvm/python/tvm/ffi/core.abi3.so" DESTINATION tilelang/3rdparty/tvm/python/tvm/ffi/) +install( + TARGETS tvm tvm_runtime tilelang_module tilelang + LIBRARY DESTINATION tilelang/lib +) + +# Build tilescale_ext PyTorch C++ extension +if(USE_CUDA) + # Find Torch + execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import torch; print(torch.utils.cmake_prefix_path)" + OUTPUT_VARIABLE TORCH_CMAKE_PREFIX_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE TORCH_CMAKE_RESULT + ) + if(TORCH_CMAKE_RESULT EQUAL 0 AND EXISTS "${TORCH_CMAKE_PREFIX_PATH}") + list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PREFIX_PATH}") + endif() + + find_package(Torch QUIET) + if(Torch_FOUND) + message(STATUS "Building tilescale_ext with Torch ${Torch_VERSION}") + + set(TILESCALE_EXT_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/tilelang/utils/ts_ext/ts_ext_bindings.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tilelang/utils/ts_ext/tensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tilelang/utils/ts_ext/ipc_ops.cpp + ) + + # Find libtorch_python.so + execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'lib', 'libtorch_python.so'))" + OUTPUT_VARIABLE TORCH_PYTHON_LIBRARY + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE TORCH_PYTHON_RESULT + ) + + python_add_library(tilescale_ext_C MODULE ${TILESCALE_EXT_SOURCES} WITH_SOABI) + target_compile_definitions(tilescale_ext_C PRIVATE TORCH_EXTENSION_NAME=_C) + target_include_directories(tilescale_ext_C PRIVATE + ${TORCH_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS} + ) + + if(TORCH_PYTHON_RESULT EQUAL 0 AND EXISTS "${TORCH_PYTHON_LIBRARY}") + message(STATUS "Found libtorch_python: ${TORCH_PYTHON_LIBRARY}") + target_link_libraries(tilescale_ext_C PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY} CUDA::cudart) + else() + message(WARNING "libtorch_python.so not found, extension may have undefined symbols") + target_link_libraries(tilescale_ext_C PRIVATE ${TORCH_LIBRARIES} CUDA::cudart) + endif() + + target_compile_options(tilescale_ext_C PRIVATE -fPIC) + set_target_properties(tilescale_ext_C PROPERTIES + OUTPUT_NAME "_C" + CXX_STANDARD 17 + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ) + + # Install as tilescale_ext/_C.so so it can be imported as tilescale_ext._C + install(TARGETS tilescale_ext_C + LIBRARY DESTINATION tilescale_ext + RUNTIME DESTINATION tilescale_ext) + else() + message(WARNING "Torch not found, tilescale_ext will not be built") + endif() endif() diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 9e380d831..5eba9044a 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -17,23 +17,23 @@ diverse, inclusive, and healthy community. 
Examples of behavior that contributes to a positive environment for our community include: -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience -* Focusing on what is best not just for us as individuals, but for the overall +- Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: -* The use of sexualized language or imagery, and sexual attention or advances of +- The use of sexualized language or imagery, and sexual attention or advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email address, +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a +- Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e4b45e24b..45284e980 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ That would be awesome if you want to contribute something to TileLang! -### Table of Contents +## Table of Contents - [Report Bugs](#report-bugs) - [Ask Questions](#ask-questions) @@ -81,6 +81,8 @@ in the main directory. This installation is removable by: python3 -m pip uninstall tilelang ``` +We also recommend installing TileLang in a more manual way for better control over the build process, by compiling the C++ extensions first and set the `PYTHONPATH`. See [Working from Source via `PYTHONPATH`](https://tilelang.com/get_started/Installation.html#working-from-source-via-pythonpath) for detailed instructions. + ## Lint Check To check the linting, run: diff --git a/LICENSE b/LICENSE index 2122252e9..09dd51c8c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,7 @@ MIT License Copyright (c) Tile-AI. 
- **During the period from December 1, 2024, to Mar 14, 2025, this project is + **During the period from December 1, 2024, to Mar 14, 2025, this project is subject to additional collaboration terms with Microsoft Corporation.** Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 88b206825..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,10 +0,0 @@ -include VERSION -include CMakeLists.txt -include requirements.txt -include requirements-test.txt -include requirements-dev.txt -include tilelang/jit/adapter/cython/cython_wrapper.pyx -recursive-include src * -recursive-include 3rdparty * -recursive-exclude 3rdparty/clang* * -recursive-exclude 3rdparty/llvm* * diff --git a/README.md b/README.md index 3962010df..886a14868 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # TileScale: Tile-based AI Compute at All Scales -TileScale is a distributed extension of TileLang. It expands TileLang's tile-level programming to multi-GPU, multi-node, and even distributed chip architecture scopes, with some new feature designs like tile-level communication and hierarchical programming introduced. +TileScale is a distributed extension of TileLang. It expands TileLang's tile-level programming to multi-GPU, multi-node, and even distributed chip architecture scopes, with some new feature designs like tile-level communication and hierarchical programming introduced. -TileScale is a distributed-native domain-specific language (DSL) and compiler stack designed for deep learning on next-generation distributed architectures. +TileScale is a distributed-native domain-specific language (DSL) and compiler stack designed for deep learning on next-generation distributed architectures. As AI model entering the "scaling-law" era, modern AI infrastructure is also scaling the computation across both intra-chip and inter-chip scopes. On one side, current large AI models are already executing on multiple GPUs or even multiple nodes connected by the high-performance links like NVLink or InfiniBand. On the other side, a bunch of next-gen AI accelerators are embracing new chip architectures—such as 3D IC, near/in-memory computing, wafer-scale accelerators, etc., which are all in distributed form inner the chip for better scalability. Together, these trends are shaping modern AI compute systems into a hybrid, multi-level of "distributed architecture". TileScale is the first programming and compiler stack to unify these intra-chip and inter-chip compute resources into a unified, hierarchical, distributed architecture, which virtualizes the whole distributed system as a unified "mega-device" to users. To facilitate programming, TileScale provides a set of consistent tile-level primitives across all hardware layers for compute, memory, and communication. Thus, users can just write tile-level computing logic or flow at certain layers of interest, then TileScale automatically compiles and optimizes the scheduling of computation, communication, memory access, and their overlap. The goal of TileScale is to define an open, streamlined programming model for future distributed architectures and systems, addressing the emerging needs of modern AI computation, such as fine-grained computation and communication overlap, flexible parallel mechanisms, dataflow computation, NUMA programming, etc. -#### The full technical white-paper is coming soon. +## The full technical white-paper is coming soon. 
## Hierarchical Distributed Architecture (HDA) Unlike traditional GPU SIMT programming, which assumes thread-level computation on a single device, TileScale is designed to manage compute, memory, and communication across all hierarchical scales, from threads and PEs to dies, chips, and nodes. It introduces a unified virtual device architecture, called Hierarchical Distributed Architecture (HDA), to abstract these distributed systems. @@ -32,16 +32,15 @@ At each layer, the associated memory may be shared among all units or distribute Following the hierarchical hardware architecture, TileScale exposes a hierarchical programming interface. The fundamental unit of computation in TileScale is at the *tile* granularity. TileScale provides consistent tile-level compute, memory, and communication operators corresponding to each hardware scales.
TileScale Programming Interface
- -* *Compute*: A compute primitive takes input tensor tiles at certain memory layer and produces output tensor tiles. The same compute primitive can be used at different scale level, which will be translated to different implementations. A primitive at a high-level scale can be implemented by the lower-level-scale primitives. For example, a block-scale operator can be implemented by a group of warp-scale or thread-scale primitives. - -* *Memory*: The memory primitives are used to copy data tiles at certain memory layer, as well as to copy data tile between different memory layers. - -* *Communicate*: The communication primitives are used to transfer data tiles between compute units over the network, as well as to manage the synchronization. TileScale provides both basic peer-to-peer communication primitives as well as the collective communication primitives like AllReduce, All2All, etc., at a specific scale level. + +- *Compute*: A compute primitive takes input tensor tiles at a certain memory layer and produces output tensor tiles. The same compute primitive can be used at different scale levels, where it will be translated to different implementations. A primitive at a high-level scale can be implemented by lower-level-scale primitives. For example, a block-scale operator can be implemented by a group of warp-scale or thread-scale primitives. + +- *Memory*: The memory primitives are used to copy data tiles at a certain memory layer, as well as to copy data tiles between different memory layers. + +- *Communicate*: The communication primitives are used to transfer data tiles between compute units over the network, as well as to manage synchronization. TileScale provides both basic peer-to-peer communication primitives and collective communication primitives such as AllReduce, All2All, etc., at a specific scale level. A primitive for a certain scale level may have multiple implementations. For example, a copy primitive could be implemented using TMA or LSU, while a remote copy across GPUs might be implemented using copy engines, TMA, or LSU. TileScale provides default implementations for each primitive, along with a compilation process to tune the best implementation. Users can also specify particular implementations through arguments in the tile primitives. -With this hierarchical interface, user can easily customize the computation at certain scale level. For example, we can leverage the DSMEM feature to implement a general cluster-scale GEMM primitive. - +With this hierarchical interface, users can easily customize the computation at a certain scale level. For example, we can leverage the DSMEM feature to implement a general cluster-scale GEMM primitive. ## System Overview and Design
TileScale system overview @@ -60,7 +59,7 @@ The layout and partition dimensions are either automatically inferred through a
### Parallel task scheduling -TileScale introduces a *T.Scale* primitive to control which hardware scale the current computations are conducted on. +TileScale introduces a *T.Scale* primitive to control which hardware scale the current computations are conducted on. It follows the SPMD (Single Program Multiple Data) programming model that scale the specified computation to all parallel units at this level. For example, the following *T.gemm* represents a warp GEMM, which executes on all warps in parallel. ```python @@ -81,18 +80,18 @@ with T.Kernel( T.gemm(A, B, C) ``` #### Task(warp) specialization -Additionally, the T.Scale primitive can also return the rank and the total number of ranks of the current scale level. This allows you to easily leverage the rank index for task specialization, such as warp specialization or any other scale-level specialization. +Additionally, the T.Scale primitive can also return the rank and the total number of ranks of the current scale level. This allows you to easily leverage the rank index for task specialization, such as warp specialization or any other scale-level specialization. ```python # warp specialize example with T.Scale("warpgroup") as wg_id, wg_num: if wg_id == 0: - # do something + # do something else: # do other thing ``` #### MPI-style programming -Combined with the communication primitives, you can also implement MPI-like programs if a communication channel exists across those ranks. For those compute units without hardware links, TileScale can also implement software channels by passing data through lower-level memory. +Combined with the communication primitives, you can also implement MPI-like programs if a communication channel exists across those ranks. For those compute units without hardware links, TileScale can also implement software channels by passing data through lower-level memory. 
```python # communication example: send data to neighbor GPU with T.Scale("device") as dev_id, dev_num: @@ -100,7 +99,7 @@ with T.Scale("device") as dev_id, dev_num: T.barrier() ``` -## Example: +## Example: ```python # Example of GEMM # 4-GPU Tensor Parallelism, using L2 to communicate @@ -119,12 +118,12 @@ def gemm( A_global = T.view(A, layout=T.FullCol) B_global = T.view(B, layout=T.FullRow) C_global = T.view(C, layout=T.Replica) - + with T.Scale("block"): A_local = T.alloc((block_M, block_K), dtype, level="l0") B_local = T.alloc((block_K, block_N), dtype, level="l0") C_local = T.alloc((block_M, block_N), accum_dtype, level="l0") - T.clear(C_local) + T.clear(C_local) for k in T.Pipelined(T.ceildiv(A_global.shape[1], block_K), num_stages=3): with T.Scale("warpgroup") as wg_id, wg_num: @@ -134,7 +133,7 @@ def gemm( T.copy(A_local_wg, A_global[by * block_M, k * block_K]) T.copy(B_local_wg, B_global[k * block_K, bx * block_N]) T.gemm(A_local_wg, B_local_wg, C_local_wg) - + # Allreduce C_local_wg through software-defined channel on L1 T.allreduce(C_local_wg) T.copy(C_global[by * block_M, bx * block_N], C_local) @@ -142,7 +141,7 @@ def gemm( with T.Scale("device") as dev_id, dev_num: # Allreduce C on L2 T.allreduce(C_global) - + ``` ```python # Example of FlashMLA @@ -156,8 +155,8 @@ def flash_mla( Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel( - device=(4), - block=(batch, heads // min(block_H, kv_group_num), + device=(4), + block=(batch, heads // min(block_H, kv_group_num), threads=256) ): with T.Scale("device"): @@ -182,8 +181,8 @@ def flash_mla( scores_scale = T.alloc([block_H], accum_dtype, level="l0") scores_sum = T.alloc([block_H], accum_dtype, level="l0") logsum = T.alloc([block_H], accum_dtype, level="l0") - - cur_kv_head = by // (kv_group_num // block_H) + + cur_kv_head = by // (kv_group_num // block_H) T.copy(Q_shared, Q_global[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) T.copy(Q_pe_shared, Q_pe_global[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) @@ -199,7 +198,7 @@ def flash_mla( T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - + T.copy(scores_max_prev, scores_max) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) @@ -217,7 +216,7 @@ def flash_mla( T.copy(acc_s_cast_local[:, block_N // 2:block_N], acc_s_local, dst=(wg_id + 1) % wg_num) # Or, you can use high level cooperative primitive # T.allgather(acc_s_local), and Cast ... - + for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dim): diff --git a/THIRDPARTYNOTICES.txt b/THIRDPARTYNOTICES.txt index b7c481841..3558662a8 100644 --- a/THIRDPARTYNOTICES.txt +++ b/THIRDPARTYNOTICES.txt @@ -1,5 +1,5 @@ -BitBLAS uses third-party material as listed below. The attached notices are -provided for informational purposes only. +BitBLAS uses third-party material as listed below. The attached notices are +provided for informational purposes only. 
Notice for apache/tvm ------------------------------- diff --git a/VERSION b/VERSION index 70f6c676e..e52aba075 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.6.post1 +0.1.7.post1 diff --git a/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py b/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py index 6401276ac..3dd82aa5e 100644 --- a/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_library_dense_fmha.py @@ -7,10 +7,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -28,15 +25,15 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) import flash_attn diff --git a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py index aefe4d420..0018e9c93 100644 --- a/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_tilelang_block_sparse_fmha.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -39,16 +36,15 @@ def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal) block_N = 64 num_stages = 2 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - accum_dtype = "float" - block_mask_dtype = "bool" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.bool def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def MMA0( K: T.Tensor(shape, dtype), @@ -60,11 +56,10 @@ def MMA0( by: T.int32, bz: T.int32, ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, 
k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -79,22 +74,24 @@ def MMA1( by: T.int32, bz: T.int32, ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @T.macro def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), ): T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. 
@@ -114,22 +111,21 @@ def Softmax( @T.macro def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), ): for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(shape, dtype), ): - with T.Kernel( - T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -142,31 +138,29 @@ def main( scores_scale = T.alloc_fragment([block_M], accum_dtype) scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - block_mask = T.alloc_local([downsample_len], block_mask_dtype) + block_mask = T.alloc_fragment([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - for vj in T.serial(downsample_len): - block_mask[vj] = BlockSparseMask[bz, by, bx, vj] + T.copy(BlockSparseMask[bz, by, bx, :], block_mask) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - if block_mask[k]: + if block_mask[k] != 0: MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) Rescale(acc_o, scores_scale) MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main @@ -175,26 +169,23 @@ def main( def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / 
downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) - program = blocksparse_flashattn( - BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + program = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) kernel = tilelang.compile(program, out_idx=4) def benchmark_fn(): diff --git a/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py index e4828ce5f..85d754ae3 100644 --- a/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_torch_block_sparse_fmha.py @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -31,39 +28,37 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) def benchmark_fn(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", 
attn, v) return ref_output ref_latency = do_bench( diff --git a/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py b/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py index 86ac894bc..7ebca93a6 100644 --- a/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py +++ b/benchmark/blocksparse_attention/benchmark_triton_block_sparse_fmha.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -56,7 +53,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) if mask_val == True: @@ -72,8 +68,7 @@ def _fwd_kernel_inner( # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N if LAST_K_BLOCK: - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, - float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -153,7 +148,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -191,24 +186,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -253,7 +236,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -271,24 +253,22 @@ def backward(ctx, do): def benchmark_topk_sparse_attention(): from benchmark_configs import configs + torch.manual_seed(0) # Config for BATCH, N_HEADS, SEQ_LEN, D_HEAD, TOPK, BLOCK in configs: - # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, 
device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) diff --git a/benchmark/distributed/README.md b/benchmark/distributed/README.md index ac1cea257..21db28531 100644 --- a/benchmark/distributed/README.md +++ b/benchmark/distributed/README.md @@ -1 +1 @@ -To compare with [TileLink](https://arxiv.org/abs/2503.20313), please install [Triton-distributed](https://github.com/ByteDance-Seed/Triton-distributed). \ No newline at end of file +To compare with [TileLink](https://arxiv.org/abs/2503.20313), please install [Triton-distributed](https://github.com/ByteDance-Seed/Triton-distributed). diff --git a/benchmark/distributed/benchmark_ag_gemm.py b/benchmark/distributed/benchmark_ag_gemm.py index a4b0bd785..8ac3c244e 100644 --- a/benchmark/distributed/benchmark_ag_gemm.py +++ b/benchmark/distributed/benchmark_ag_gemm.py @@ -1,4 +1,4 @@ -'''Bugfix first: +"""Bugfix first: Triton-distributed/python/triton_dist/kernels/nvidia/allgather_gemm.py:566 ```python M = M_per_rank * ctx.num_ranks @@ -7,9 +7,9 @@ ```python M = M_per_rank * num_ranks ``` -''' +""" -#TODO: further tune the performance +# TODO: further tune the performance import argparse import torch @@ -27,36 +27,27 @@ @tilelang.jit( out_idx=-1, - pass_configs={"tl.disable_rdc": True} - #FIXME: https://github.com/tile-ai/tilelang/issues/659 + pass_configs={"tl.disable_rdc": True}, + # FIXME: https://github.com/tile-ai/tilelang/issues/659 ) -def matmut_transpose(rank, - num_ranks, - M, - N_per_rank, - K, - block_M, - block_N, - block_K, - dtype="float16", - threads=256, - persistent=False) -> tilelang.JITKernel: +def matmut_transpose( + rank, num_ranks, M, N_per_rank, K, block_M, block_N, block_K, dtype="float16", threads=256, persistent=False +) -> tilelang.JITKernel: accum_dtype = "float32" signal_dtype = "uint64" # NVSHMEM requires uint64 for signal assert M % block_M == 0 and N_per_rank % block_N == 0 and K % block_K == 0 - M_blocks, N_blocks, K_stages = T.ceildiv(M, block_M), T.ceildiv(N_per_rank, - block_N), T.ceildiv(K, block_K) + M_blocks, N_blocks, K_stages = T.ceildiv(M, block_M), T.ceildiv(N_per_rank, block_N), T.ceildiv(K, block_K) M_blocks_per_rank = M_blocks // num_ranks sm_num = driver.get_num_sms() # Get # of SMs for persistent kernel @T.prim_func def nonpersistent_kernel( - A: T.Tensor((M, K), dtype), # type: ignore - B: T.Tensor((N_per_rank, K), dtype), # type: ignore - signal: T.Tensor((num_ranks), signal_dtype), # type: ignore - C: T.Tensor((M, N_per_rank), dtype), # type: ignore + A: T.Tensor((M, K), dtype), # type: ignore + B: T.Tensor((N_per_rank, K), dtype), # type: ignore + signal: T.Tensor((num_ranks), signal_dtype), # type: ignore + C: T.Tensor((M, N_per_rank), dtype), # type: ignore ): with T.Kernel(N_blocks, M_blocks, threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -81,10 +72,10 @@ def nonpersistent_kernel( @T.prim_func def persistent_kernel( - A: T.Tensor((M, K), dtype), # type: ignore - B: T.Tensor((N_per_rank, K), dtype), # type: ignore - signal: T.Tensor((num_ranks), signal_dtype), # type: ignore - C: T.Tensor((M, N_per_rank), dtype), 
# type: ignore + A: T.Tensor((M, K), dtype), # type: ignore + B: T.Tensor((N_per_rank, K), dtype), # type: ignore + signal: T.Tensor((num_ranks), signal_dtype), # type: ignore + C: T.Tensor((M, N_per_rank), dtype), # type: ignore ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -145,9 +136,10 @@ def overlapped_ag_gemm( block_K=64, dtype=dtype, threads=threads, - persistent=persistent) + persistent=persistent, + ) if RANK == 0 and args.print_source: - print('We currently use cp-engine for producer, print consumer kernel code only...') + print("We currently use cp-engine for producer, print consumer kernel code only...") print(consumer.get_kernel_source()) ag_buffer = pynvshmem.nvshmem_create_tensor_list_intra_node( @@ -164,14 +156,13 @@ def overlapped_ag_gemm( gemm_stream.wait_stream(current_stream) with torch.cuda.stream(ag_stream): - ag_buffer[rank][rank * M_per_rank:(rank + 1) * M_per_rank, :].copy_(A) + ag_buffer[rank][rank * M_per_rank : (rank + 1) * M_per_rank, :].copy_(A) pynvshmem.write64_on_stream(signal_buffer[rank], 1, ag_stream) - pynvshmem.nvshmemx_barrier_all_on_stream( - ag_stream.cuda_stream) # Ensure visible to all ranks + pynvshmem.nvshmemx_barrier_all_on_stream(ag_stream.cuda_stream) # Ensure visible to all ranks rank_orders = [(rank + i) % num_ranks for i in range(1, num_ranks)] for src_rank in rank_orders: - dst = ag_buffer[rank][src_rank * M_per_rank:(src_rank + 1) * M_per_rank, :] - src = ag_buffer[src_rank][src_rank * M_per_rank:(src_rank + 1) * M_per_rank, :] + dst = ag_buffer[rank][src_rank * M_per_rank : (src_rank + 1) * M_per_rank, :] + src = ag_buffer[src_rank][src_rank * M_per_rank : (src_rank + 1) * M_per_rank, :] dst.copy_(src) pynvshmem.write64_on_stream(signal_buffer[src_rank], 1, ag_stream) @@ -188,19 +179,17 @@ def parse_args(): parser.add_argument("--M", type=int, default=8192) parser.add_argument("--N", type=int, default=49152) parser.add_argument("--K", type=int, default=12288) - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) parser.add_argument("--threads", type=int, default=256, help="number of threads in a block") - parser.add_argument( - "--persistent", action='store_true', default=False, help="use persistent GEMM consumers") + parser.add_argument("--persistent", action="store_true", default=False, help="use persistent GEMM consumers") parser.add_argument("--print_source", action="store_true", help="print kernel source code") parser.add_argument("--warmup", type=int, default=5, help="number of warmup iterations") parser.add_argument("--repeat", type=int, default=10, help="number of repeat iterations") return parser.parse_args() -if __name__ == '__main__': - assert torch.cuda.get_device_capability()[0] >= 9, '❗This benchmark requires sm_90 or higher' +if __name__ == "__main__": + assert torch.cuda.get_device_capability()[0] >= 9, "❗This benchmark requires sm_90 or higher" WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP = init_distributed(return_tp_group=True) assert WORLD_SIZE <= 8, "This benchmark is designed for intra-node AG-GEMM" @@ -231,12 +220,10 @@ def torch_ag_gemm(): # Benchmark Triton-dist (overlapped) ag_intranode_stream = torch.cuda.Stream(priority=-1) - ctx = create_ag_gemm_context( - A, B, RANK, PE_num, max_M=M, for_correctness=False, ag_intranode_stream=ag_intranode_stream) + ctx = create_ag_gemm_context(A, B, RANK, 
PE_num, max_M=M, for_correctness=False, ag_intranode_stream=ag_intranode_stream) def triton_ag_gemm(persistent, autotune): - return ag_gemm( - A, B, ctx=ctx, rank=RANK, num_ranks=PE_num, persistent=persistent, autotune=autotune) + return ag_gemm(A, B, ctx=ctx, rank=RANK, num_ranks=PE_num, persistent=persistent, autotune=autotune) dist.barrier(TP_GROUP) triton_ag_gemm = partial(triton_ag_gemm, persistent=False, autotune=False) @@ -257,8 +244,7 @@ def tilelang_ag_gemm(): print(f"rank {RANK} tilelang AG-GEMM avg time: {tl_t} ms") # Check correctness - assert torch.allclose( - tl_out, torch_out, atol=1e-2, rtol=1e-2), f'max error: {(tl_out - torch_out).abs().max()}' + assert torch.allclose(tl_out, torch_out, atol=1e-2, rtol=1e-2), f"max error: {(tl_out - torch_out).abs().max()}" print(f"rank {RANK} check passed.✅") dist.destroy_process_group() diff --git a/benchmark/distributed/benchmark_all_gather.py b/benchmark/distributed/benchmark_all_gather.py index 24d3445b2..676ad4853 100644 --- a/benchmark/distributed/benchmark_all_gather.py +++ b/benchmark/distributed/benchmark_all_gather.py @@ -30,9 +30,8 @@ def cp_engine_producer_all_gather_full_mesh_pull( if src_rank == rank: continue # peer: src_rank, offset src_rank[src_rank] -> rank[src_rank] - dst = remote_tensor_buffers[rank][src_rank * M_per_rank:(src_rank + 1) * M_per_rank, :] - src = remote_tensor_buffers[src_rank][src_rank * M_per_rank:(src_rank + 1) * - M_per_rank, :] + dst = remote_tensor_buffers[rank][src_rank * M_per_rank : (src_rank + 1) * M_per_rank, :] + src = remote_tensor_buffers[src_rank][src_rank * M_per_rank : (src_rank + 1) * M_per_rank, :] dst.copy_(src) pynvshmem.write64_on_stream( barrier_buffers[rank][src_rank], @@ -47,8 +46,8 @@ def allgather(PE_num, M, N, dtype="float16", threads=128): @T.prim_func def a2a_pull( - A: T.Tensor((M_per_rank, N), dtype), # type: ignore - B: T.Tensor((M, N), dtype), # type: ignore + A: T.Tensor((M_per_rank, N), dtype), # type: ignore + B: T.Tensor((M, N), dtype), # type: ignore ): with T.Kernel(M_per_rank // block_M, PE_num - 1, threads=threads) as (bx, by): mype = T.get_pe() @@ -57,7 +56,10 @@ def a2a_pull( T.getmem_nbi_block( T.address_of(B[peer * M_per_rank + bx * block_M, 0]), - T.address_of(A[bx * block_M, 0]), block_M * N * dtype_map[dtype].itemsize, peer) + T.address_of(A[bx * block_M, 0]), + block_M * N * dtype_map[dtype].itemsize, + peer, + ) # We don't need a barrier for the pull mode return a2a_pull @@ -65,12 +67,9 @@ def a2a_pull( def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--M", type=int, - default=8192) # Follow Triton-setting, we benchmark on (M, N) = (8192, 12288) + parser.add_argument("--M", type=int, default=8192) # Follow Triton-setting, we benchmark on (M, N) = (8192, 12288) parser.add_argument("--N", type=int, default=12288) - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) parser.add_argument("--threads", type=int, default=128, help="number of threads in a block") parser.add_argument("--print_source", action="store_true", help="print kernel source code") parser.add_argument("--warmup", type=int, default=5, help="number of warmup iterations") @@ -78,7 +77,7 @@ def parse_args(): return parser.parse_args() -if __name__ == '__main__': +if __name__ == "__main__": WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP = init_distributed(return_tp_group=True) assert WORLD_SIZE <= 8, "This 
benchmark is designed for intra-node communication" @@ -111,13 +110,9 @@ def torch_ag(): # Benchmark Triton-dist def triton_ag(): - ag_buffer_ptrs = pynvshmem.nvshmem_create_tensor_list_intra_node( - [M, N], torch_dtype) # buffer for dist-triton allgather - signal = pynvshmem.nvshmem_create_tensor_list_intra_node( - ([PE_num]), torch.uint64) # each rank corresponds to one barrier - ag_buffer_ptrs[RANK][ - RANK * M_per_rank:(RANK + 1) * M_per_rank, - ].copy_(local_data) + ag_buffer_ptrs = pynvshmem.nvshmem_create_tensor_list_intra_node([M, N], torch_dtype) # buffer for dist-triton allgather + signal = pynvshmem.nvshmem_create_tensor_list_intra_node(([PE_num]), torch.uint64) # each rank corresponds to one barrier + ag_buffer_ptrs[RANK][RANK * M_per_rank : (RANK + 1) * M_per_rank,].copy_(local_data) signal[RANK].zero_() pynvshmem.nvshmemx_barrier_all_on_stream(torch.cuda.current_stream().cuda_stream) cp_engine_producer_all_gather_full_mesh_pull( @@ -134,7 +129,7 @@ def tilelang_ag(): ag_buffer = pynvshmem.nvshmem_create_tensor([M_per_rank, N], torch_dtype) ag_buffer.copy_(local_data) out = pynvshmem.nvshmem_create_tensor([M, N], torch_dtype) - out[RANK * M_per_rank:(RANK + 1) * M_per_rank, :].copy_(local_data) + out[RANK * M_per_rank : (RANK + 1) * M_per_rank, :].copy_(local_data) kernel(ag_buffer, out) return out @@ -145,8 +140,7 @@ def tilelang_ag(): # Tested on 4A100 with full-mesh NVLink, comparable with Triton-dist and ~20x faster than Torch # Check correctness - assert torch.allclose( - tl_out, torch_out, atol=0, rtol=0), f'max error: {(tl_out - torch_out).abs().max()}' + assert torch.allclose(tl_out, torch_out, atol=0, rtol=0), f"max error: {(tl_out - torch_out).abs().max()}" print(f"rank {RANK} check passed.✅") dist.destroy_process_group() diff --git a/benchmark/distributed/benchmark_all_to_all.py b/benchmark/distributed/benchmark_all_to_all.py index 6aae8b203..d2d0ded3a 100644 --- a/benchmark/distributed/benchmark_all_to_all.py +++ b/benchmark/distributed/benchmark_all_to_all.py @@ -13,19 +13,18 @@ def all_to_all(max_m, hidden, num_tot_experts, WORLD_SIZE, threads=128, dtype="float16"): - scale_dtype = "float" EXPERTS_PER_RANK = num_tot_experts // WORLD_SIZE @T.prim_func def main( - send_buf: T.Tensor((max_m, hidden), dtype), # type: ignore - recv_buf: T.Tensor((WORLD_SIZE * max_m * 2, hidden), dtype), # type: ignore - scale_send_buf: T.Tensor((max_m), scale_dtype), # type: ignore - scale_recv_buf: T.Tensor((WORLD_SIZE * max_m * 2), scale_dtype), # type: ignore - split_send_buf: T.Tensor((num_tot_experts), "int32"), # type: ignore - split_recv_buf: T.Tensor((num_tot_experts * 2), "int32"), # type: ignore - signal_buf: T.Tensor((WORLD_SIZE * 2), "uint64"), # type: ignore + send_buf: T.Tensor((max_m, hidden), dtype), # type: ignore + recv_buf: T.Tensor((WORLD_SIZE * max_m * 2, hidden), dtype), # type: ignore + scale_send_buf: T.Tensor((max_m), scale_dtype), # type: ignore + scale_recv_buf: T.Tensor((WORLD_SIZE * max_m * 2), scale_dtype), # type: ignore + split_send_buf: T.Tensor((num_tot_experts), "int32"), # type: ignore + split_recv_buf: T.Tensor((num_tot_experts * 2), "int32"), # type: ignore + signal_buf: T.Tensor((WORLD_SIZE * 2), "uint64"), # type: ignore ): with T.Kernel(WORLD_SIZE, threads=threads) as (bx): peer = bx @@ -63,17 +62,14 @@ def main( class TilelangAllToAll: - def __init__(self, ctx: AllToAllContext): self.ctx = ctx - self.func = all_to_all( - ctx.max_m, ctx.hidden, ctx.num_tot_experts, ctx.WORLD_SIZE, threads=128) + self.func = all_to_all(ctx.max_m, ctx.hidden, 
ctx.num_tot_experts, ctx.WORLD_SIZE, threads=128) self.kernel = tilelang.compile(self.func, pass_configs={"tl.disable_tma_lower": True}) if self.ctx.rank == 0: print(self.kernel.get_kernel_source()) - def __call__(self, send_tensor: torch.Tensor, send_split_cumsum: torch.Tensor, - send_scale: torch.Tensor | None): + def __call__(self, send_tensor: torch.Tensor, send_split_cumsum: torch.Tensor, send_scale: torch.Tensor | None): """ low-latency all-to-all communication """ @@ -161,7 +157,6 @@ def calc_gather_index( row_end: int, BLOCK_SIZE: int = 1024, ): - @triton.jit def _kernel( scatter_index: torch.Tensor, @@ -202,8 +197,7 @@ def _kernel( def calc_scatter_index_stable(choosed_experts: torch.Tensor): - return (choosed_experts.flatten().argsort(stable=True).argsort().int().view( - choosed_experts.shape)) + return choosed_experts.flatten().argsort(stable=True).argsort().int().view(choosed_experts.shape) def main(): @@ -227,7 +221,6 @@ def main(): ) def perf_triton(input: torch.Tensor, scale_tensor: torch.Tensor, exp_indices: torch.Tensor): - # prepare the indexes splits_gpu_cur_rank = torch.bincount(exp_indices.view(-1), minlength=args.G).to(torch.int32) split_cumsum = splits_to_cumsum(splits_gpu_cur_rank) @@ -237,20 +230,17 @@ def perf_triton(input: torch.Tensor, scale_tensor: torch.Tensor, exp_indices: to # calculate the gather idx accordingly gather_idx_cur_rank, _ = calc_gather_index(scatter_idx_cur_rank, 0, token_num * args.topk) # use torch native scatter forward(will not be included in the e2e time measurement) - scattered_input = torch.empty( - input.size(0) * args.topk, input.size(1), dtype=input.dtype, device=input.device) + scattered_input = torch.empty(input.size(0) * args.topk, input.size(1), dtype=input.dtype, device=input.device) scattered_scale_tensor = torch.empty( (scale_tensor.size(0) * args.topk), dtype=scale_tensor.dtype, device=scale_tensor.device, ) scattered_input.copy_(torch.index_select(input, dim=0, index=gather_idx_cur_rank)) - scattered_scale_tensor.copy_( - torch.index_select(scale_tensor, dim=0, index=gather_idx_cur_rank)) + scattered_scale_tensor.copy_(torch.index_select(scale_tensor, dim=0, index=gather_idx_cur_rank)) def fwd(): - return fast_all_to_all(all_to_all_ctx, scattered_input, split_cumsum, - scattered_scale_tensor if args.with_scale else None) + return fast_all_to_all(all_to_all_ctx, scattered_input, split_cumsum, scattered_scale_tensor if args.with_scale else None) torch.cuda._sleep(1000000000) # warmup @@ -269,21 +259,22 @@ def fwd(): # 1. dispatch dispatch_splits, dispatch_token, dispatch_scale = fast_all_to_all( - all_to_all_ctx, scattered_input, split_cumsum, - scattered_scale_tensor if args.with_scale else None) + all_to_all_ctx, scattered_input, split_cumsum, scattered_scale_tensor if args.with_scale else None + ) dispatch_token, dispatch_scale = all_to_all_post_process( - all_to_all_ctx, dispatch_splits, dispatch_token, - dispatch_scale if args.with_scale else None) + all_to_all_ctx, dispatch_splits, dispatch_token, dispatch_scale if args.with_scale else None + ) # 2. compute: moe_compute(dispatch_token, dispatch_scale, moe_weight, ...) # ... # 3. 
combine combine_splits, combine_token, combine_scale = fast_all_to_all( - all_to_all_ctx, dispatch_token, splits_to_cumsum(dispatch_splits), dispatch_scale) + all_to_all_ctx, dispatch_token, splits_to_cumsum(dispatch_splits), dispatch_scale + ) combine_token, combine_scale = all_to_all_post_process( - all_to_all_ctx, combine_splits, combine_token, - combine_scale if args.with_scale else None) + all_to_all_ctx, combine_splits, combine_token, combine_scale if args.with_scale else None + ) # 3.1. reduce: [num_tokens_local_rank * topk] => [num_tokens_local_rank] combine_reduced_out = torch.zeros_like(input) @@ -293,8 +284,7 @@ def fwd(): torch.testing.assert_close(combine_reduced_out, input * args.topk, rtol=1e-2, atol=1e-2) tilelang_all_to_all = TilelangAllToAll(all_to_all_ctx) - tilelang_all_to_all(scattered_input, split_cumsum, - scattered_scale_tensor if args.with_scale else None) + tilelang_all_to_all(scattered_input, split_cumsum, scattered_scale_tensor if args.with_scale else None) # torch.testing.assert_close(tilelang_out[1], dispatch_token, rtol=1e-2, atol=1e-2) # torch.testing.assert_close(tilelang_scale, dispatch_scale, rtol=1e-2, atol=1e-2) @@ -307,8 +297,7 @@ def fwd(): exp_indices = generate_random_exp_indices(token_num, args.G, args.topk) assert exp_indices.size(0) == token_num and exp_indices.size(1) == args.topk exp_indices = exp_indices.to("cuda") - input = ( - torch.rand(token_num, args.N, dtype=torch.float32).to(dtype_map[args.dtype]).to("cuda")) + input = torch.rand(token_num, args.N, dtype=torch.float32).to(dtype_map[args.dtype]).to("cuda") scale_tensor = torch.rand(token_num, dtype=torch.float32).to("cuda") torch.cuda.synchronize() diff --git a/benchmark/distributed/benchmark_gemm_rs.py b/benchmark/distributed/benchmark_gemm_rs.py index 5be4431c3..a4570d2f4 100644 --- a/benchmark/distributed/benchmark_gemm_rs.py +++ b/benchmark/distributed/benchmark_gemm_rs.py @@ -1,6 +1,6 @@ # Currently we only implement in Tilelang -#TODO: add Triton-dist v3.4 impl -#TODO: further tune the performance +# TODO: add Triton-dist v3.4 impl +# TODO: further tune the performance import argparse import torch @@ -8,40 +8,33 @@ import pynvshmem import tilelang import tilelang.language as T + # from tilelang.carver.arch import driver from tilelang.distributed import init_distributed, dtype_map, perf_fn tilelang.disable_cache() -@tilelang.jit(pass_configs={"tl.disable_rdc": True} - #FIXME: https://github.com/tile-ai/tilelang/issues/659 - ) -def fused_gemm_scatter(rank, - num_ranks, - M, - N, - K_per_rank, - block_M, - block_N, - block_K, - dtype="float16", - threads=128, - persistent=False) -> tilelang.JITKernel: +@tilelang.jit( + pass_configs={"tl.disable_rdc": True} + # FIXME: https://github.com/tile-ai/tilelang/issues/659 +) +def fused_gemm_scatter( + rank, num_ranks, M, N, K_per_rank, block_M, block_N, block_K, dtype="float16", threads=128, persistent=False +) -> tilelang.JITKernel: accum_dtype = "float32" assert M % block_M == 0 and N % block_N == 0 and K_per_rank % block_K == 0 - M_blocks, N_blocks, K_stages = T.ceildiv(M, block_M), T.ceildiv(N, block_N), T.ceildiv( - K_per_rank, block_K) + M_blocks, N_blocks, K_stages = T.ceildiv(M, block_M), T.ceildiv(N, block_N), T.ceildiv(K_per_rank, block_K) M_blocks_per_rank = M_blocks // num_ranks # sm_num = driver.get_num_sms() # Get # of SMs for persistent kernel @T.prim_func def nonpersistent_kernel( - A: T.Tensor((M, K_per_rank), dtype), # type: ignore - B: T.Tensor((N, K_per_rank), dtype), # type: ignore - C: T.Tensor((M_blocks, N_blocks, 
block_M, block_N), dtype), # type: ignore + A: T.Tensor((M, K_per_rank), dtype), # type: ignore + B: T.Tensor((N, K_per_rank), dtype), # type: ignore + C: T.Tensor((M_blocks, N_blocks, block_M, block_N), dtype), # type: ignore ): with T.Kernel(N_blocks, M_blocks, threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -63,8 +56,8 @@ def nonpersistent_kernel( T.copy(C_shared, C[by, bx, :, :]) peer = by // M_blocks_per_rank T.putmem_nbi_block( - T.address_of(C[by, bx, 0, 0]), T.address_of(C[by, bx, 0, 0]), - block_M * block_N * dtype_map[dtype].itemsize, peer) + T.address_of(C[by, bx, 0, 0]), T.address_of(C[by, bx, 0, 0]), block_M * block_N * dtype_map[dtype].itemsize, peer + ) assert not persistent return nonpersistent_kernel @@ -110,10 +103,10 @@ def overlapped_gemm_rs( block_K=block_K, dtype=dtype, threads=threads, - persistent=persistent) + persistent=persistent, + ) - gemm_output = pynvshmem.nvshmem_create_tensor_list_intra_node( - [M_blocks, N_blocks, block_M, block_N], dtype=input.dtype) + gemm_output = pynvshmem.nvshmem_create_tensor_list_intra_node([M_blocks, N_blocks, block_M, block_N], dtype=input.dtype) output = torch.empty((M_per_rank, N), dtype=input.dtype, device="cuda") fused_gemm_scatter_kernel(input, weight, gemm_output[rank]) dist.barrier(TP_GROUP) @@ -126,19 +119,17 @@ def parse_args(): parser.add_argument("--M", type=int, default=16384) parser.add_argument("--N", type=int, default=12288) parser.add_argument("--K", type=int, default=49152) - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) parser.add_argument("--threads", type=int, default=128, help="number of threads in a block") - parser.add_argument( - "--persistent", action='store_true', default=False, help="use persistent GEMM producers") + parser.add_argument("--persistent", action="store_true", default=False, help="use persistent GEMM producers") parser.add_argument("--print_source", action="store_true", help="print kernel source code") parser.add_argument("--warmup", type=int, default=5, help="number of warmup iterations") parser.add_argument("--repeat", type=int, default=10, help="number of repeat iterations") return parser.parse_args() -if __name__ == '__main__': - assert torch.cuda.get_device_capability()[0] >= 9, '❗This benchmark requires sm_90 or higher' +if __name__ == "__main__": + assert torch.cuda.get_device_capability()[0] >= 9, "❗This benchmark requires sm_90 or higher" WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP = init_distributed(return_tp_group=True) assert WORLD_SIZE <= 8, "This benchmark is designed for intra-node GEMM-RS" @@ -176,16 +167,14 @@ def torch_gemm_rs(): print("Use non-persistent GEMM producers...") def tilelang_gemm_rs(): - return overlapped_gemm_rs( - input, weight, rank=RANK, num_ranks=PE_num, persistent=args.persistent) + return overlapped_gemm_rs(input, weight, rank=RANK, num_ranks=PE_num, persistent=args.persistent) dist.barrier(TP_GROUP) tl_out, tl_t = perf_fn(tilelang_gemm_rs, warmup, repeat) print(f"rank {RANK} tilelang GEMM avg time: {tl_t} ms") # Check correctness - assert torch.allclose( - tl_out, torch_out, atol=1e-2, rtol=1e-2), f'max error: {(tl_out - torch_out).abs().max()}' + assert torch.allclose(tl_out, torch_out, atol=1e-2, rtol=1e-2), f"max error: {(tl_out - torch_out).abs().max()}" print(f"rank {RANK} check passed.✅") dist.destroy_process_group() diff --git 
a/benchmark/distributed/benchmark_reduce_scatter.py b/benchmark/distributed/benchmark_reduce_scatter.py index c6431f79a..277125bb6 100644 --- a/benchmark/distributed/benchmark_reduce_scatter.py +++ b/benchmark/distributed/benchmark_reduce_scatter.py @@ -11,13 +11,13 @@ tilelang.disable_cache() -#TODO: Bench on 4/8 H100 -#TODO: split N? -'''init_nvshmem_by_torch_process_group(_TP_GROUP) +# TODO: Bench on 4/8 H100 +# TODO: split N? +"""init_nvshmem_by_torch_process_group(_TP_GROUP) Note: Minor numerical differences exist between Triton/TileLang and Torch (~1e-2) due to the order reductions are handled in different implementations. (No error when #PE = 2) -''' +""" def reducescatter(PE_num, M, N, dtype="float16", threads=128): @@ -27,8 +27,8 @@ def reducescatter(PE_num, M, N, dtype="float16", threads=128): @T.prim_func def pull_reduce( - A: T.Tensor((M, N), dtype), # type: ignore - B: T.Tensor((M_per_rank, N), dtype), # type: ignore + A: T.Tensor((M, N), dtype), # type: ignore + B: T.Tensor((M_per_rank, N), dtype), # type: ignore ): with T.Kernel(M_per_rank // block_M, threads=threads) as (bx): mype = T.get_pe() @@ -42,15 +42,17 @@ def pull_reduce( T.getmem_nbi_block( T.address_of(A_shared[peer, 0, 0]), T.address_of(A[mype * M_per_rank + bx * block_M, 0]), - block_M * N * dtype_map[dtype].itemsize, peer) + block_M * N * dtype_map[dtype].itemsize, + peer, + ) base = mype * M_per_rank + bx * block_M - T.copy(A[base:base + block_M, :], A_shared[mype, :, :]) + T.copy(A[base : base + block_M, :], A_shared[mype, :, :]) T.fence() # Ensure reduce happens after all IO T.copy(A_shared, A_local) T.reduce_sum(A_local, A_local_sum, dim=0) - T.copy(A_local_sum, B[bx * block_M:bx * block_M + block_M, :]) + T.copy(A_local_sum, B[bx * block_M : bx * block_M + block_M, :]) return pull_reduce @@ -59,8 +61,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--M", type=int, default=8192) parser.add_argument("--N", type=int, default=16384) - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) parser.add_argument("--threads", type=int, default=128, help="number of threads in a block") parser.add_argument("--print_source", action="store_true", help="print kernel source code") parser.add_argument("--warmup", type=int, default=5, help="number of warmup iterations") @@ -68,8 +69,8 @@ def parse_args(): return parser.parse_args() -if __name__ == '__main__': - assert torch.cuda.get_device_capability()[0] >= 9, '❗This benchmark requires sm_90 or higher' +if __name__ == "__main__": + assert torch.cuda.get_device_capability()[0] >= 9, "❗This benchmark requires sm_90 or higher" WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP = init_distributed(return_tp_group=True) assert WORLD_SIZE <= 8, "This benchmark is designed for intra-node RS" @@ -83,7 +84,7 @@ def parse_args(): nelems = M * PE_num func = reducescatter(PE_num, M, N, dtype=dtype, threads=threads) - kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True}, target='cuda') + kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True}, target="cuda") # Get CUDA Source if RANK == 0 and args.print_source: @@ -142,8 +143,7 @@ def tilelang_rs(): print(f"rank {RANK} tilelang reduce_scatter avg time: {tl_t} ms") # Check correctness - assert torch.allclose( - tl_out, torch_out, atol=1e-2, rtol=1e-2), f'max error: {(tt_out - torch_out).abs().max()}' + assert 
torch.allclose(tl_out, torch_out, atol=1e-2, rtol=1e-2), f"max error: {(tl_out - torch_out).abs().max()}" print(f"rank {RANK} check passed.✅") dist.destroy_process_group() diff --git a/benchmark/distributed/ipc_impls/README.md b/benchmark/distributed/ipc_impls/README.md index d89d00956..59ad34e50 100644 --- a/benchmark/distributed/ipc_impls/README.md +++ b/benchmark/distributed/ipc_impls/README.md @@ -31,4 +31,3 @@ python benchmark/distributed/ipc_impls/benchmark_unrolledcp_p2p.py | 4,194,304 | 10.6560 | 2.2474 | 11.9145 | 2.2845 | > **Note:** All data presented above are unidirectional bandwidth. - diff --git a/benchmark/distributed/ipc_impls/benchmark_nvshmem_p2p.py b/benchmark/distributed/ipc_impls/benchmark_nvshmem_p2p.py index 5ab6265ae..b4836d1c3 100644 --- a/benchmark/distributed/ipc_impls/benchmark_nvshmem_p2p.py +++ b/benchmark/distributed/ipc_impls/benchmark_nvshmem_p2p.py @@ -12,15 +12,14 @@ from tilelang.distributed import init_distributed, perf_fn import pynvshmem -os.environ['NCCL_DEBUG'] = 'WARN' +os.environ["NCCL_DEBUG"] = "WARN" def nvshmem_kernel_push(size, threads): - @T.prim_func def nvshmem_push( - dst: T.Tensor((size), "float32"), # type: ignore - src: T.Tensor((size), "float32"), # type: ignore + dst: T.Tensor((size), "float32"), # type: ignore + src: T.Tensor((size), "float32"), # type: ignore ): with T.Kernel(1, threads=threads): T.putmem_block( @@ -35,11 +34,10 @@ def nvshmem_push( def nvshmem_kernel_pull(size, threads): - @T.prim_func def nvshmem_pull( - dst: T.Tensor((size), "float32"), # type: ignore - src: T.Tensor((size), "float32"), # type: ignore + dst: T.Tensor((size), "float32"), # type: ignore + src: T.Tensor((size), "float32"), # type: ignore ): with T.Kernel(1, threads=threads): T.getmem_block( @@ -53,8 +51,7 @@ def nvshmem_pull( return nvshmem_pull -def benchmark_nvshmem_bw(rank: int, num_ranks: int, group: dist.ProcessGroup, size: int, - args: argparse.Namespace): +def benchmark_nvshmem_bw(rank: int, num_ranks: int, group: dist.ProcessGroup, size: int, args: argparse.Namespace): assert num_ranks == 2, "this benchmark only supports 2 ranks" assert args.threads % 32 == 0, "threads must be divisible by 32" @@ -90,10 +87,8 @@ def pull_fn(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--warmup", type=int, default=10, help="number of warmup iterations (default: 10)") - parser.add_argument( - "--repeat", type=int, default=50, help="number of repeat iterations (default: 50)") + parser.add_argument("--warmup", type=int, default=10, help="number of warmup iterations (default: 10)") + parser.add_argument("--repeat", type=int, default=50, help="number of repeat iterations (default: 50)") parser.add_argument("--threads", type=int, default=128, help="Threads per block (default: 128)") args = parser.parse_args() @@ -102,8 +97,6 @@ def pull_fn(): size = 2**log_size push_bw, pull_bw = benchmark_nvshmem_bw(rank, num_ranks, group, size, args) if rank == 0: - print( - f"size={size*4} bytes, nvshmem push bw: {push_bw:.4f} GB/s, nvshmem pull bw: {pull_bw:.4f} GB/s" - ) + print(f"size={size * 4} bytes, nvshmem push bw: {push_bw:.4f} GB/s, nvshmem pull bw: {pull_bw:.4f} GB/s") dist.destroy_process_group() diff --git a/benchmark/distributed/ipc_impls/benchmark_unrolledcp_p2p.py b/benchmark/distributed/ipc_impls/benchmark_unrolledcp_p2p.py index c7d3f2556..c320688ac 100644 --- a/benchmark/distributed/ipc_impls/benchmark_unrolledcp_p2p.py +++ b/benchmark/distributed/ipc_impls/benchmark_unrolledcp_p2p.py @@ -8,15 +8,14 @@ from
tilelang.distributed import init_dist, perf_fn tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' +os.environ["NCCL_DEBUG"] = "WARN" def ipc_kernel_push(size, threads, unroll_factor): - @T.prim_func def ipc_push( - dst: T.Tensor((size), "float32"), # type: ignore - src: T.Tensor((size), "float32"), # type: ignore + dst: T.Tensor((size), "float32"), # type: ignore + src: T.Tensor((size), "float32"), # type: ignore ): with T.Kernel(1, threads=threads): rank = T.alloc_local([1], "uint64") @@ -29,18 +28,18 @@ def ipc_push( dst=T.address_of(dst[warp_start]), size=warp_copy_size, dst_pe=rank[0] ^ 1, - unroll_factor=unroll_factor) + unroll_factor=unroll_factor, + ) T.fence_sys() return ipc_push def ipc_kernel_pull(size, threads, unroll_factor): - @T.prim_func def ipc_pull( - dst: T.Tensor((size), "float32"), # type: ignore - src: T.Tensor((size), "float32"), # type: ignore + dst: T.Tensor((size), "float32"), # type: ignore + src: T.Tensor((size), "float32"), # type: ignore ): with T.Kernel(1, threads=threads): rank = T.alloc_local([1], "uint64") @@ -53,14 +52,14 @@ def ipc_pull( dst=T.address_of(dst[warp_start]), size=warp_copy_size, src_pe=rank[0] ^ 1, - unroll_factor=unroll_factor) + unroll_factor=unroll_factor, + ) T.fence_sys() return ipc_pull -def benchmark_ipc_bw(rank: int, num_ranks: int, group: dist.ProcessGroup, size: int, - args: argparse.Namespace, allocator): +def benchmark_ipc_bw(rank: int, num_ranks: int, group: dist.ProcessGroup, size: int, args: argparse.Namespace, allocator): assert num_ranks == 2, "this benchmark only supports 2 ranks" assert args.threads % 32 == 0, "threads must be divisible by 32" @@ -100,30 +99,22 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**30, - device="cuda", - is_distributed=True, - local_rank=rank, - num_local_ranks=num_ranks, - group=group) + size=2**30, device="cuda", is_distributed=True, local_rank=rank, num_local_ranks=num_ranks, group=group + ) for log_size in range(9, 21): size = 2**log_size push_bw, pull_bw = benchmark_ipc_bw(rank, num_ranks, group, size, args, allocator) if rank == 0: - print( - f"size={size*4} bytes, ipc push bw: {push_bw:.4f} GB/s, ipc pull bw: {pull_bw:.4f} GB/s" - ) + print(f"size={size * 4} bytes, ipc push bw: {push_bw:.4f} GB/s, ipc pull bw: {pull_bw:.4f} GB/s") dist.destroy_process_group() if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--warmup", type=int, default=10, help="number of warmup iterations (default: 10)") - parser.add_argument( - "--repeat", type=int, default=50, help="number of repeat iterations (default: 50)") + parser.add_argument("--warmup", type=int, default=10, help="number of warmup iterations (default: 10)") + parser.add_argument("--repeat", type=int, default=50, help="number of repeat iterations (default: 50)") parser.add_argument("--threads", type=int, default=128, help="Threads per block (default: 128)") parser.add_argument("--unroll-factor", type=int, default=4, help="Unroll factor (default: 4)") args = parser.parse_args() diff --git a/benchmark/distributed/utils.py b/benchmark/distributed/utils.py index fba164121..87cf9cc24 100644 --- a/benchmark/distributed/utils.py +++ b/benchmark/distributed/utils.py @@ -13,7 +13,6 @@ class AllToAllContext: - def __init__( self, max_m: int, diff --git a/benchmark/mamba2/README.md b/benchmark/mamba2/README.md index 8c6d933d5..f0b4b7e80 100644 --- 
a/benchmark/mamba2/README.md +++ b/benchmark/mamba2/README.md @@ -45,9 +45,14 @@ PY | 16384 | 2.531 | 135.711 | | 32768 | 5.076 | 135.379 | +## Compare with Baselines + +- Triton: v3.5.0, mamba-ssm: v2.2.6.post3 +- Helion: v0.2.1 +
[Figure: Mamba2_chunk_scan performance comparison across compilers on NVIDIA H100]
-
\ No newline at end of file + diff --git a/benchmark/mamba2/benchmark_mamba_chunk_scan.py b/benchmark/mamba2/benchmark_mamba_chunk_scan.py index 78dfb135e..55f802b4f 100644 --- a/benchmark/mamba2/benchmark_mamba_chunk_scan.py +++ b/benchmark/mamba2/benchmark_mamba_chunk_scan.py @@ -5,6 +5,20 @@ import tilelang.language as T from einops import rearrange, repeat import itertools +import math +from tilelang.profiler import do_bench + +try: + from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_fwd +except ImportError as err: + raise ImportError("Please install mamba-ssm to use the triton chunk scan operator.") from err + +try: + import helion + from helion._testing import run_example + import helion.language as hl +except ImportError as err: + raise ImportError("Please install helion to use the helion chunk scan operator.") from err def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): @@ -37,14 +51,15 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :] decay = torch.exp(dt_segment_sum) scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s") - causal_mask = torch.tril( - torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) + causal_mask = torch.tril(torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) scores_decay = scores_decay.masked_fill(~causal_mask, 0) - out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype), - rearrange(x, "b (c s) h p -> b c s h p", c=nchunks)) + out = torch.einsum( + "bchls,bhcs,bcshp->bclhp", scores_decay.to(x.dtype), dt.to(x.dtype), rearrange(x, "b (c s) h p -> b c s h p", c=nchunks) + ) state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1")) - out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange( - C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + out_prev = ( + torch.einsum("bclhn,bchpn->bclhp", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + ) out = out + out_prev out = rearrange(out, "b c l h p -> b (c l) h p") if D is not None: @@ -54,13 +69,114 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): return out +def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D): + out, _ = _chunk_scan_fwd(cb, x, dt, dA_cumsum, C, states, D) + return out + + +def chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D): + @helion.kernel() + def helion_mamba2_chunk_scan_kernel( + cb: torch.Tensor, + x: torch.Tensor, + dt: torch.Tensor, + dA_cumsum: torch.Tensor, + C: torch.Tensor, + prev_states: torch.Tensor, + D: torch.Tensor, + ) -> torch.Tensor: + """ + Argument: + cb: (batch, nchunks, ngroups, chunk_size, chunk_size) + x: (batch, seqlen, nheads, headdim) + dt: (batch, nheads, nchunks, chunk_size) + dA_cumsum: (batch, nheads, nchunks, chunk_size) + C: (batch, seqlen, ngroups, dstate) + prev_states: (batch, nchunks, nheads, headdim, dstate) + D: (nheads,) + Return: + out: (batch, seqlen, nheads, headdim) + """ + + batch, nchunks, ngroups, chunk_size, _ = cb.shape + _, seqlen, nheads, headdim = x.shape + _, _, _, dstate = C.shape + assert nchunks == (seqlen + chunk_size - 1) // chunk_size + + block_m = hl.register_block_size(chunk_size) + block_n = hl.register_block_size(headdim) + block_k = hl.register_block_size(64, 64) + dstate = hl.specialize(dstate) + + assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size) + assert x.shape == (batch, seqlen, nheads, headdim) + 
assert dt.shape == (batch, nheads, nchunks, chunk_size) + assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size) + assert C.shape == (batch, seqlen, ngroups, dstate) + assert prev_states.shape == (batch, nchunks, nheads, headdim, dstate) + assert D.shape == (nheads,) + + dtype = cb.dtype + accum_dtype = torch.float32 + assert x.dtype == dt.dtype == dA_cumsum.dtype == C.dtype == prev_states.dtype == D.dtype == dtype + + out = torch.empty_like(x) + + p = 1.44269504 + + for tile_h, tile_m, tile_n, tile_b, tile_c in hl.tile( + [nheads, chunk_size, headdim, batch, nchunks], + block_size=[1, block_m, block_n, 1, 1], + ): + acc_o = hl.zeros([tile_m, tile_n], dtype=accum_dtype) + dA_cumsum_local_m = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, tile_m].to(torch.float32) + scale_m_local = torch.exp2(dA_cumsum_local_m * p) + + C_local = C[ + tile_b.begin, + tile_m.index + tile_c.begin * chunk_size, + tile_h.begin // (nheads // ngroups), + :, + ] + prev_states_local = prev_states[tile_b.begin, tile_c.begin, tile_h.begin, tile_n, :] + acc_o = hl.dot(C_local, prev_states_local.T, acc=acc_o) + acc_o *= scale_m_local[:, None] + + for tile_k in hl.tile((tile_m.id + 1) * block_m, block_size=block_k): + cb_local = cb[ + tile_b.begin, + tile_c.begin, + tile_h.begin // (nheads // ngroups), + tile_m, + tile_k, + ] + dA_cumsum_local_k = dA_cumsum[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32) + cb_local *= torch.exp2(dA_cumsum_local_m[:, None] * p - dA_cumsum_local_k[None, :] * p) + dt_local = dt[tile_b.begin, tile_h.begin, tile_c.begin, tile_k].to(torch.float32) + cb_local = (cb_local * dt_local[None, :]).to(dtype) + pred = (tile_m.index + 0)[:, None] >= (tile_k.index + 0)[None, :] + cb_local = torch.where(pred, cb_local, torch.zeros_like(cb_local)) + x_local = x[ + tile_b.begin, + tile_c.begin * chunk_size + tile_k.index, + tile_h.begin, + tile_n, + ] + acc_o = hl.dot(cb_local, x_local, acc=acc_o) + + D_local = D[tile_h.begin].to(torch.float32) + x_residual = x[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, tile_n].to(torch.float32) + acc_o += x_residual * D_local + out[tile_b.begin, tile_c.begin * chunk_size + tile_m.index, tile_h.begin, tile_n] = acc_o.to(dtype=dtype) + + return out + + args = (cb, x, dt, dA_cumsum, C, states, D) + run_example(helion_mamba2_chunk_scan_kernel, ref_program, args) + + def get_configs(): - iter_params = dict( - block_M=[64, 128, 256], - block_N=[32, 64], - block_K=[64, 128, 256], - block_Dstate=[128], - num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128, 256], block_N=[32, 64], block_K=[64, 128, 256], block_Dstate=[128], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -71,56 +187,58 @@ def get_configs(): tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, ) -def chunk_scan_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - block_Dstate=128, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_scan_fwd( + batch, + seqlen, + chunk_size, + ngroups, + nheads, + headdim, + dstate, + block_M=64, + block_N=64, + block_K=64, + block_Dstate=128, + num_stages=2, + threads=128, +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func def main( - cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore - x: T.Tensor((batch, seqlen, nheads, headdim), 
dtype), # type: ignore - dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore - prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore - D: T.Tensor((nheads), dtype), # type: ignore - Output: T.Tensor((batch, seqlen, nheads, headdim), dtype) # type: ignore + cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore + prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore + D: T.Tensor((nheads), dtype), # type: ignore + Output: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore ): - with T.Kernel( - nheads, - T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + with T.Kernel(nheads, T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), batch * nchunks, threads=threads) as ( + bz, + bx, + by, + ): acc_o = T.alloc_fragment((block_M, block_N), accum_dtype) acc_o_shared = T.alloc_shared((block_M, block_N), dtype) - cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn") + cb_shared = T.alloc_shared((block_M, block_K), dtype) cb_local = T.alloc_fragment((block_M, block_K), dtype) - dA_cs_k_shared = T.alloc_shared((block_K), dtype, scope="shared") + dA_cs_k_shared = T.alloc_shared((block_K), dtype) dA_cs_k_local = T.alloc_fragment((block_K), accum_dtype) dA_cs_m_local = T.alloc_fragment((block_M), accum_dtype) - dt_shared = T.alloc_shared((block_K), dtype, scope="shared") + dt_shared = T.alloc_shared((block_K), dtype) dt_local = T.alloc_fragment((block_K), accum_dtype) - x_shared = T.alloc_shared((block_K, block_N), dtype, scope="shared.dyn") - dA_cs_m_shared = T.alloc_shared((block_M), dtype, scope="shared") + x_shared = T.alloc_shared((block_K, block_N), dtype) + dA_cs_m_shared = T.alloc_shared((block_M), dtype) scale_m_local = T.alloc_fragment((block_M), accum_dtype) C_shared = T.alloc_shared((block_M, block_Dstate), dtype) prev_state_shared = T.alloc_shared((block_N, block_Dstate), dtype) D_local = T.alloc_fragment((1), accum_dtype) - x_residual_shared = T.alloc_shared((block_M, block_N), dtype, scope="shared.dyn") + x_residual_shared = T.alloc_shared((block_M, block_N), dtype) x_residual_local = T.alloc_fragment((block_M, block_N), accum_dtype) batch_idx = by % batch @@ -130,27 +248,31 @@ def main( m_idx = bx // T.ceildiv(headdim, block_N) n_idx = bx % T.ceildiv(headdim, block_N) - T.annotate_layout({ - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), - cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), - x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared) - }) + T.annotate_layout( + { + cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), + x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared), + } + ) T.no_set_max_nreg() - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M], - dA_cs_m_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M : (m_idx + 1) * block_M], dA_cs_m_shared) T.copy(dA_cs_m_shared, 
dA_cs_m_local) T.clear(acc_o) for i in T.Parallel(block_M): scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p) T.copy( - C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared) - T.copy( - prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N, - 0:block_Dstate], prev_state_shared) + C[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz // (nheads // ngroups), + 0:block_Dstate, + ], + C_shared, + ) + T.copy(prev_states[batch_idx, chunk_idx, bz, n_idx * block_N : (n_idx + 1) * block_N, 0:block_Dstate], prev_state_shared) T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] *= scale_m_local[i] @@ -159,34 +281,47 @@ def main( for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - cb[batch_idx, chunk_idx, bz // (nheads // ngroups), - m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K], - cb_shared) + cb[ + batch_idx, + chunk_idx, + bz // (nheads // ngroups), + m_idx * block_M : (m_idx + 1) * block_M, + k * block_K : (k + 1) * block_K, + ], + cb_shared, + ) T.copy(cb_shared, cb_local) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cs_k_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cs_k_shared) T.copy(dA_cs_k_shared, dA_cs_k_local) for i, j in T.Parallel(block_M, block_K): - cb_local[i, - j] = cb_local[i, - j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + cb_local[i, j] = cb_local[i, j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dt_shared) T.copy(dt_shared, dt_local) for i, j in T.Parallel(block_M, block_K): cb_local[i, j] *= dt_local[j] for i, j in T.Parallel(block_M, block_K): - cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, - cb_local[i, j], 0) + cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, cb_local[i, j], 0) T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_shared, + ) T.gemm(cb_local, x_shared, acc_o) D_local[0] = D[bz] T.copy( - x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N], - x_residual_shared) + x[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_residual_shared, + ) T.copy(x_residual_shared, x_residual_local) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] += x_residual_local[i, j] * D_local[0] @@ -194,26 +329,41 @@ def main( T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N]) + Output[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + ) return main if __name__ == "__main__": 
parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) + nchunks = math.ceil(seq_len / chunk_size) total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate + print("Benchmarking TileLang...") kernel = chunk_scan_fwd(batch, seq_len, chunk_size, groups, heads, dim, dstate) best_latency = kernel.latency best_config = kernel.config @@ -221,3 +371,18 @@ def main( print(f"Best latency: {best_latency}") print(f"Best TFlops: {total_flops / best_latency * 1e-9}") print(f"Best config: {best_config}") + + cb = torch.randn(batch, nchunks, groups, chunk_size, chunk_size).half().cuda() + x = torch.randn(batch, seq_len, heads, dim).half().cuda() + dt = torch.randn(batch, heads, nchunks, chunk_size).half().cuda() + dA_cumsum = torch.randn(batch, heads, nchunks, chunk_size).half().cuda() + C = torch.randn(batch, seq_len, groups, dstate).half().cuda() + states = torch.randn(batch, nchunks, heads, dim, dstate).half().cuda() + D = torch.randn(heads).half().cuda() + + print("Benchmarking Triton...") + triton_latency = do_bench(lambda: chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D), _n_warmup=10, _n_repeat=10) + print(f"Triton TFlops: {total_flops / triton_latency * 1e-9}") + + print("Benchmarking Helion...") + chunk_scan_helion(cb, x, dt, dA_cumsum, C, states, D) diff --git a/benchmark/matmul/benchmark_matmul.py b/benchmark/matmul/benchmark_matmul.py index c64f4fabf..643c1fd5e 100644 --- a/benchmark/matmul/benchmark_matmul.py +++ b/benchmark/matmul/benchmark_matmul.py @@ -6,6 +6,7 @@ import tilelang.language as T from tilelang.autotuner import autotune from tilelang import jit + # Configure logger logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -61,9 +62,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -101,9 +102,7 @@ def get_configs(args, kwargs): 
policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -112,7 +111,9 @@ def get_configs(args, kwargs): warmup=3, rep=20, ) -@jit(out_idx=[2],) +@jit( + out_idx=[2], +) def matmul( M, N, @@ -154,14 +155,14 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -176,7 +177,6 @@ def main( # Bind x-dimension to block index in N, # y-dimension to block index in M. with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - # Allocate shared memory for A sub-block of shape (block_M, block_K) A_shared = T.alloc_shared((block_M, block_K), dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) diff --git a/benchmark/matmul/benchmark_matmul_intrinsic.py b/benchmark/matmul/benchmark_matmul_intrinsic.py index 94e36b385..4ef860c21 100644 --- a/benchmark/matmul/benchmark_matmul_intrinsic.py +++ b/benchmark/matmul/benchmark_matmul_intrinsic.py @@ -6,7 +6,8 @@ import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func from tilelang.autotuner import autotune import itertools @@ -48,22 +49,22 @@ def tl_matmul( enable_rasteration=False, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config - # chunk = 32 if in_dtype == "float16" else 64 + # chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" block_M = block_row_warps * warp_row_tiles @@ -103,12 +104,11 @@ def tl_matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -116,10 +116,12 @@ def main( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # 
Improve L2 Cache T.use_swizzle(panel_size=10, enable=enable_rasteration) @@ -127,7 +129,6 @@ def main( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -137,7 +138,6 @@ def main( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a(A_local, A_shared, ki) @@ -194,9 +194,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float16", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float16, ).with_arch(arch) func = carve_template.equivalent_function() @@ -223,7 +223,6 @@ def get_configs(args, kwargs): for config in configs: print(config) else: - iter_params = dict( block_row_warps=[1, 2, 4], block_col_warps=[1, 2, 4], @@ -233,9 +232,7 @@ def get_configs(args, kwargs): stage=[0, 2], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -247,14 +244,16 @@ def get_configs(args, kwargs): ref_prog=ref_program, skip_check=True, ) -@tl.jit(out_idx=[2],) +@tl.jit( + out_idx=[2], +) def matmul( M, N, K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float16", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float16, with_roller=False, block_row_warps=None, block_col_warps=None, @@ -291,19 +290,14 @@ def kernel(): parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument( - "--with_roller", - type=bool, - default=False, - help="Whether to use roller to deduce search spaces") - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type") + parser.add_argument("--with_roller", type=bool, default=False, help="Whether to use roller to deduce search spaces") + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "int8"], help="Input data type") args = parser.parse_args() M, N, K = args.m, args.n, args.k - in_dtype = args.dtype - out_dtype = "float32" if in_dtype == "int8" else "float16" - accum_dtype = "float32" if in_dtype == "int8" else "float16" + in_dtype = T.dtype(args.dtype) + out_dtype = T.float32 if in_dtype == T.int8 else T.float16 + accum_dtype = T.float32 if in_dtype == T.int8 else T.float16 with_roller = args.with_roller with_roller = True # Compute total floating-point operations diff --git a/benchmark/matmul/benchmark_matmul_sp.py b/benchmark/matmul/benchmark_matmul_sp.py index 4e4ed6128..7ecffc26a 100644 --- a/benchmark/matmul/benchmark_matmul_sp.py +++ b/benchmark/matmul/benchmark_matmul_sp.py @@ -9,7 +9,7 @@ from tilelang.autotuner import autotune from tilelang import jit from tilelang.contrib import nvcc -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout # Configure logger logger = logging.getLogger(__name__) @@ -70,7 +70,8 @@ def get_configs(M, N, K): thread_num, policy, enable_rasterization, - )) + ) + ) configs = [ { @@ -81,12 +82,13 @@ def get_configs(M, N, K): "thread_num": c[4], 
"policy": c[5], "enable_rasterization": c[6], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs -def matmul_sp(M, N, K, accum_dtype): +def matmul_sp(M, N, K, in_dtype, accum_dtype): """ Create an autotuned matrix multiplication kernel for matrices of shape: - A: (M, K) @@ -126,7 +128,9 @@ def matmul_sp(M, N, K, accum_dtype): warmup=3, rep=20, ) - @jit(out_idx=[2],) + @jit( + out_idx=[2], + ) def kernel( block_M=None, block_N=None, @@ -161,15 +165,14 @@ def kernel( """ # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float16" e_factor, e_dtype = ARCH_INFO[arch] @T.prim_func def main( - A_sparse: T.Tensor((M, K // 2), dtype), - E: T.Tensor((M, K // e_factor), e_dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), accum_dtype), + A_sparse: T.Tensor((M, K // 2), in_dtype), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), in_dtype), + C: T.Tensor((M, N), accum_dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -183,13 +186,11 @@ def main( """ # Bind x-dimension to block index in N, # y-dimension to block index in M. - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): # Allocate shared memory for A sub-block of shape (block_M, block_K) - A_shared = T.alloc_shared((block_M, block_K // 2), dtype) + A_shared = T.alloc_shared((block_M, block_K // 2), in_dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) - B_shared = T.alloc_shared((block_K, block_N), dtype) + B_shared = T.alloc_shared((block_K, block_N), in_dtype) # Allocate shared memory for E sub-block of shape (block_M, block_K // E_factor) E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) # Allocate a local fragment for intermediate accumulation @@ -202,14 +203,12 @@ def main( T.disable_warp_group_reg_alloc() T.use_swizzle(panel_size=10, enable=enable_rasterization) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", backend="cutlass", block_k=block_K), - E_shared: - make_metadata_layout( - E_shared, mma_dtype="float16", backend="cutlass", block_k=block_K), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=in_dtype, block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=in_dtype, block_k=block_K), + } + ) # Loop over sub-blocks in K dimension, pipelined by num_stages for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): # Load a sub-block of A from global memory into A_shared @@ -220,7 +219,7 @@ def main( T.copy(B[k * block_K, bx * block_N], B_shared) # Perform a partial matrix multiplication: # C_local += A_shared @ B_shared - T.gemm_sp( + T.gemm_sp_v2( A_shared, E_shared, B_shared, @@ -244,18 +243,13 @@ def main( parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") parser.add_argument("--disable_cache", action="store_true") - parser.add_argument( - "--accum_dtype", - type=str, - default="float", - choices=["float", "float16"], - help="Accumulation datatype") + parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") parser.add_argument( "--bench_torch_sparse", type=str, - choices=['cutlass', 'cusparselt'], + 
choices=["cutlass", "cusparselt"], default=None, - help="Whether to benchmark against torch sparse implementation, note that at current time only sm80 is supported" + help="Whether to benchmark against torch sparse implementation, note that at current time only sm80 is supported", ) args = parser.parse_args() @@ -268,7 +262,7 @@ def main( total_flops = 2 * M * N * K # matmul(...) returns (best_latency, best_config, ref_latency) - best_result = matmul_sp(M, N, K, args.accum_dtype) + best_result = matmul_sp(M, N, K, T.float16, args.accum_dtype) best_latency = best_result.latency best_config = best_result.config A = torch.randn(M, K, dtype=torch.float16, device="cuda") @@ -277,7 +271,8 @@ def main( if args.bench_torch_sparse is not None: from torch.sparse import to_sparse_semi_structured, SparseSemiStructuredTensor - if args.bench_torch_sparse == 'cutlass': + + if args.bench_torch_sparse == "cutlass": SparseSemiStructuredTensor._FORCE_CUTLASS = True A_sp = to_sparse_semi_structured(A, transposed=False) torch_sparse_latency = do_bench(lambda: A_sp @ B) @@ -288,8 +283,6 @@ def main( print(f"Best config: {best_config}") if args.bench_torch_sparse is not None: - print( - f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}" - ) + print(f"Torch sparse ({args.bench_torch_sparse}) TFlops: {total_flops / torch_sparse_latency * 1e-9:.3f}") print(f"Reference Dense TFlops: {total_flops / ref_latency * 1e-9:.3f}") diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py index 36b910355..64714b649 100644 --- a/benchmark/matmul_fp8/benchmark_matmul.py +++ b/benchmark/matmul_fp8/benchmark_matmul.py @@ -1,7 +1,7 @@ import argparse import itertools +import torch import logging -import tilelang import tilelang.language as T from tilelang.autotuner import autotune from tilelang import jit @@ -62,9 +62,9 @@ def get_configs(args, kwargs): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -99,12 +99,11 @@ def get_configs(args, kwargs): block_K=[64, 128], num_stages=[0, 1, 2, 3], thread_num=[128, 256], + k_pack=[1, 2], policy=[T.GemmWarpPolicy.Square], enable_rasteration=[True, False], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] return configs @@ -114,7 +113,9 @@ def get_configs(args, kwargs): warmup=3, rep=20, ) -@jit(out_idx=[2],) +@jit( + out_idx=[2], +) def matmul( M, N, @@ -125,6 +126,7 @@ def matmul( block_K=None, num_stages=None, thread_num=None, + k_pack=None, policy=None, enable_rasteration=None, ): @@ -156,14 +158,14 @@ def matmul( # Use half-precision for input data to reduce memory bandwidth, # accumulate in float for better numerical accuracy - dtype = "float8_e4m3" - accum_dtype = "float" + dtype = T.float8_e4m3fnuz if torch.version.hip is not None else T.float8_e4m3fn + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): """ The compiled TVM function for block-level matrix multiplication. @@ -178,7 +180,6 @@ def main( # Bind x-dimension to block index in N, # y-dimension to block index in M. 
with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - # Allocate shared memory for A sub-block of shape (block_M, block_K) A_shared = T.alloc_shared((block_M, block_K), dtype) # Allocate shared memory for B sub-block of shape (block_N, block_K) @@ -190,8 +191,6 @@ def main( # Enable (or disable) swizzling optimization T.use_swizzle(panel_size=10, enable=enable_rasteration) - # to utilize swizzle tma layout - T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)}) # Clear out the accumulation buffer T.clear(C_local) @@ -210,6 +209,7 @@ def main( C_local, transpose_B=True, policy=policy, + k_pack=k_pack, ) # Write back the results from C_local to the global memory C T.copy(C_local, C_shared) diff --git a/cmake/load_tvm.cmake b/cmake/load_tvm.cmake index 21fe6dfb5..cb21be95f 100644 --- a/cmake/load_tvm.cmake +++ b/cmake/load_tvm.cmake @@ -3,16 +3,28 @@ set(TVM_BUILD_FROM_SOURCE TRUE) set(TVM_SOURCE ${CMAKE_SOURCE_DIR}/3rdparty/tvm) -if(DEFINED $ENV{TVM_ROOT}) +if(DEFINED ENV{TVM_ROOT}) if(EXISTS $ENV{TVM_ROOT}/cmake/config.cmake) set(TVM_SOURCE $ENV{TVM_ROOT}) + message(STATUS "Using TVM_ROOT from environment variable: ${TVM_SOURCE}") endif() endif() +message(STATUS "Using TVM source: ${TVM_SOURCE}") + set(TVM_INCLUDES ${TVM_SOURCE}/include - ${TVM_SOURCE}/ffi/include ${TVM_SOURCE}/src ${TVM_SOURCE}/3rdparty/dlpack/include ${TVM_SOURCE}/3rdparty/dmlc-core/include ) + +if(EXISTS ${TVM_SOURCE}/ffi/include) + list(APPEND TVM_INCLUDES ${TVM_SOURCE}/ffi/include) +elseif(EXISTS ${TVM_SOURCE}/3rdparty/tvm-ffi/include) + list(APPEND TVM_INCLUDES ${TVM_SOURCE}/3rdparty/tvm-ffi/include) +endif() + +if(EXISTS ${TVM_SOURCE}/3rdparty/tvm-ffi/3rdparty/dlpack/include) + list(APPEND TVM_INCLUDES ${TVM_SOURCE}/3rdparty/tvm-ffi/3rdparty/dlpack/include) +endif() diff --git a/cmake/pypi-z3/FindZ3.cmake b/cmake/pypi-z3/FindZ3.cmake new file mode 100644 index 000000000..d7920f8f9 --- /dev/null +++ b/cmake/pypi-z3/FindZ3.cmake @@ -0,0 +1,30 @@ +if(Z3_FOUND) + return() +endif() +find_package(Python3 COMPONENTS Interpreter REQUIRED) +execute_process( + COMMAND "${Python3_EXECUTABLE}" -c "import z3; print(z3.__path__[0])" + OUTPUT_VARIABLE Z3_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE Z3_PYTHON_RESULT +) +if(NOT Z3_PYTHON_RESULT EQUAL 0 OR Z3_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate z3 Python package. 
Ensure z3-solver>=4.13.0 is installed.") +endif() +message("-- Find Z3 in path: ${Z3_PATH}") +find_path(Z3_INCLUDE_DIR NO_DEFAULT_PATH NAMES z3++.h PATHS ${Z3_PATH}/include) +find_library(Z3_LIBRARY NO_DEFAULT_PATH NAMES z3 libz3 PATHS ${Z3_PATH}/bin ${Z3_PATH}/lib ${Z3_PATH}/lib64) +message("-- Found Z3 include dir: ${Z3_INCLUDE_DIR}") +message("-- Found Z3 library: ${Z3_LIBRARY}") +add_library(z3::libz3 SHARED IMPORTED GLOBAL) +set_target_properties(z3::libz3 + PROPERTIES + IMPORTED_LOCATION ${Z3_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${Z3_INCLUDE_DIR} +) +if(NOT Z3_INCLUDE_DIR OR NOT Z3_LIBRARY) + message(FATAL_ERROR "Could not find Z3 library or include directory") +endif() +set(Z3_CXX_INCLUDE_DIRS ${Z3_INCLUDE_DIR}) +set(Z3_C_INCLUDE_DIRS ${Z3_INCLUDE_DIR}) +set(Z3_FOUND TRUE) diff --git a/docker/Dockerfile.cu118 b/docker/Dockerfile.cu118 index 9256fc09b..969b0e43c 100644 --- a/docker/Dockerfile.cu118 +++ b/docker/Dockerfile.cu118 @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/pytorch:22.12-py3 +FROM nvcr.io/nvidia/pytorch:22.12-py3 WORKDIR /root @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu120 b/docker/Dockerfile.cu120 index c89ce82ef..341fe40c0 100644 --- a/docker/Dockerfile.cu120 +++ b/docker/Dockerfile.cu120 @@ -1,4 +1,4 @@ -FROM nvcr.io/nvidia/pytorch:23.01-py3 +FROM nvcr.io/nvidia/pytorch:23.01-py3 WORKDIR /root @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu121 b/docker/Dockerfile.cu121 index 5b092773d..f91029d75 100644 --- a/docker/Dockerfile.cu121 +++ b/docker/Dockerfile.cu121 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu123 b/docker/Dockerfile.cu123 index 2715536a8..b3d1217fd 100644 --- a/docker/Dockerfile.cu123 +++ b/docker/Dockerfile.cu123 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . 
-v CMD bash diff --git a/docker/Dockerfile.cu124 b/docker/Dockerfile.cu124 index fb9654f48..335f52565 100644 --- a/docker/Dockerfile.cu124 +++ b/docker/Dockerfile.cu124 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu125 b/docker/Dockerfile.cu125 index c409667cb..148e44b41 100644 --- a/docker/Dockerfile.cu125 +++ b/docker/Dockerfile.cu125 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu126 b/docker/Dockerfile.cu126 index 93593b5df..c031c2bc9 100644 --- a/docker/Dockerfile.cu126 +++ b/docker/Dockerfile.cu126 @@ -23,6 +23,6 @@ RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && c RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . -v CMD bash diff --git a/docker/Dockerfile.cu128 b/docker/Dockerfile.cu128 index 1617bc79c..2b895ecd8 100644 --- a/docker/Dockerfile.cu128 +++ b/docker/Dockerfile.cu128 @@ -20,9 +20,12 @@ ENV LIBGL_ALWAYS_INDIRECT=1 RUN conda install pip cmake && conda install -c conda-forge libstdcxx-ng=12 && conda clean --all -RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev +RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev \ + build-essential cmake libedit-dev libxml2-dev cython3 + +RUN pip install cython RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main TileLang \ - && cd TileLang && ./install_cuda.sh + && cd TileLang && USE_CUDA=1 pip install -e . 
-v CMD bash diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1fb23a9f3..5f61f0e2e 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -9,23 +9,43 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential git wget \ libgtest-dev libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev llvm-dev \ + rocm-dev rocm-libs hip-dev hipblas-dev rocblas-dev \ && apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/* ENV PATH="/opt/conda/bin:${PATH}" ENV LIBGL_ALWAYS_INDIRECT=1 +ENV USE_ROCM=1 +ENV USE_CUDA=0 +ENV ROCM_HOME=/opt/rocm +ENV HIP_PLATFORM=amd +ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" RUN conda run -n py_3.10 conda install pip cmake -y && \ conda run -n py_3.10 conda install -c conda-forge libstdcxx-ng=12 -y && \ conda clean --all -RUN apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev +RUN apt-get update && apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev && \ + apt-get clean autoclean && rm -rf /var/lib/apt/lists/{cache,log} /tmp/* /var/tmp/* -RUN git clone https://github.com/tile-ai/tilelang.git --recursive -b main tilelang && \ - conda run -n py_3.10 bash -c "cd tilelang && ./install_rocm.sh" +# Copy local tilelang directory instead of cloning from git +# Build from tilelang root: docker build -f docker/Dockerfile.rocm -t mi300:latest . +COPY . /root/tilelang -RUN conda init bash +RUN mv /opt/conda/envs/py_3.10/compiler_compat /opt/conda/envs/py_3.10/compiler_compat.bak || true && \ + conda run -n py_3.10 bash -c "export USE_ROCM=1 USE_CUDA=0 && pip install 'numpy<2.0' --force-reinstall" && \ + conda run -n py_3.10 bash -c "cd /root/tilelang && \ + # Backup and modify pyproject.toml to remove torch from dependencies \ + cp pyproject.toml pyproject.toml.bak && \ + sed -i '/^[[:space:]]*\"torch/d' pyproject.toml && \ + # Install tilelang with all dependencies except torch \ + USE_ROCM=1 USE_CUDA=0 pip install -e . -v && \ + # Restore original pyproject.toml \ + mv pyproject.toml.bak pyproject.toml" + +RUN conda init bash && \ + echo "conda activate py_3.10" >> /root/.bashrc SHELL ["/bin/bash", "-l", "-c"] -CMD ["bash", "-c", "source ~/.bashrc && conda activate py_3.10 && exec bash"] \ No newline at end of file +ENTRYPOINT ["/bin/bash", "--login", "-i"] diff --git a/docs/.gitignore b/docs/.gitignore index 4d8eb4049..79ba97163 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,2 +1,2 @@ _build/ -autoapi/ \ No newline at end of file +autoapi/ diff --git a/docs/CNAME b/docs/CNAME index ca903c694..6862cd2e9 100644 --- a/docs/CNAME +++ b/docs/CNAME @@ -1 +1 @@ -tilelang.com \ No newline at end of file +tilelang.com diff --git a/docs/README.md b/docs/README.md index 349c0eccc..896d778d2 100644 --- a/docs/README.md +++ b/docs/README.md @@ -27,4 +27,4 @@ cd _build/html python3 -m http.server ``` -Then you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending ` -p PORT_NUMBER` in the python command above). +Then you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending `-p PORT_NUMBER` in the python command above). 
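The ROCm Dockerfile above temporarily drops the `torch` dependency lines from `pyproject.toml` (the `cp` / `sed` / `mv` sequence) so the editable install reuses the PyTorch that ships with the base image. Purely as an illustration of what that `sed` line does — this is a sketch, not code used by the build — the same filtering step in Python would look roughly like:

```python
# Illustrative sketch only: mirrors the cp / sed / mv sequence in docker/Dockerfile.rocm,
# which removes lines starting with "torch from pyproject.toml before `pip install -e .`
# and restores the original file afterwards.
import shutil


def strip_torch_requirement(path: str = "pyproject.toml") -> None:
    shutil.copy(path, path + ".bak")  # cp pyproject.toml pyproject.toml.bak
    with open(path) as f:
        lines = f.readlines()
    # sed -i '/^[[:space:]]*"torch/d' pyproject.toml
    kept = [ln for ln in lines if not ln.lstrip().startswith('"torch')]
    with open(path, "w") as f:
        f.writelines(kept)


def restore_requirements(path: str = "pyproject.toml") -> None:
    shutil.move(path + ".bak", path)  # mv pyproject.toml.bak pyproject.toml
```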
diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 000000000..a1fee9c3d --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1,10 @@ +/* Reduce the displayed size of the sidebar logo in Furo */ +.sidebar-logo { + max-height: 125px; + width: auto; +} + +/* Optional: keep container from growing too tall due to spacing */ +.sidebar-logo-container { + line-height: 0; +} diff --git a/docs/_static/img/logo-row.svg b/docs/_static/img/logo-row.svg index 633243f3a..e73244b74 100644 --- a/docs/_static/img/logo-row.svg +++ b/docs/_static/img/logo-row.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/docs/_static/img/logo-v2.png b/docs/_static/img/logo-v2.png new file mode 100644 index 000000000..410773f60 Binary files /dev/null and b/docs/_static/img/logo-v2.png differ diff --git a/docs/_static/img/logo.png b/docs/_static/img/logo.png new file mode 100644 index 000000000..5d04697ce Binary files /dev/null and b/docs/_static/img/logo.png differ diff --git a/docs/_static/img/sparse_mma_storage_example.png b/docs/_static/img/sparse_mma_storage_example.png new file mode 100644 index 000000000..0b1639819 Binary files /dev/null and b/docs/_static/img/sparse_mma_storage_example.png differ diff --git a/docs/compiler_internals/inject_fence_proxy.md b/docs/compiler_internals/inject_fence_proxy.md index 81f498e57..7a89456ac 100644 --- a/docs/compiler_internals/inject_fence_proxy.md +++ b/docs/compiler_internals/inject_fence_proxy.md @@ -17,7 +17,7 @@ The pass is conservative: unknown extern calls are treated as async so that the ### Timeline View ``` -generic initialize_descriptor → generic shared-store → async wgmma +generic initialize_wgmma_descriptor → generic shared-store → async wgmma │ │ │ └─ generic proxy ┴─ generic proxy ┴─ async proxy │ fence inserted here ↑ @@ -53,7 +53,7 @@ def kernel(): with T.Kernel(1): desc = T.decl_buffer((1,), "uint64", scope="local.descriptor") smem = T.decl_buffer((128,), "float16", scope="shared") - T.initialize_descriptor(desc, T.uint64(0), 2, 1, 32) + T.initialize_wgmma_descriptor(desc, T.uint64(0), 2, 1, 32) smem[0] = T.float16(0) T.ptx_wgmma_ss( "float16", @@ -83,7 +83,7 @@ def kernel(): with T.Kernel(1): desc = T.decl_buffer((1,), "uint64", scope="local.descriptor") smem = T.decl_buffer((128,), "float16", scope="shared") - T.initialize_descriptor(desc, T.uint64(0), 2, 1, 32) + T.initialize_wgmma_descriptor(desc, T.uint64(0), 2, 1, 32) smem[0] = T.float16(0) T.fence_proxy_async() T.ptx_wgmma_ss( diff --git a/docs/compiler_internals/tensor_checks.md b/docs/compiler_internals/tensor_checks.md new file mode 100644 index 000000000..ed5a9e691 --- /dev/null +++ b/docs/compiler_internals/tensor_checks.md @@ -0,0 +1,386 @@ +# Tensor Checks (Host-Side Auto-Validation) + +This page explains the host-side checks that TileLang automatically inserts into the generated host stub for kernels. When you pass `torch.Tensor` or any DLPack-compatible object to a TileLang kernel, the host stub validates argument count, pointer kinds, dtype, shape, strides, device, and more — so you don’t need to handwrite Python checks. This keeps the ABI stable and significantly reduces Python overhead compared to doing equivalent checks in Python or via pybind. + +## Why Host-Side Checks +- ABI stability: the entry is based on TVM FFI + DLPack, consistently accepting tensors and scalars. +- Lower overhead: shifting checks from Python into C reduces interpreter/property-access costs; the call overhead is lower than pybind-based approaches. 
+- Focused error reporting: assertions are raised close to the call site with precise “which field failed” messages. + +## How To Inspect Host Source +You can inspect the auto-generated host source (with all checks and the final device-kernel call) for debugging: + +```python +print(matmul_relu_kernel.get_host_source()) +``` + +--- + +## What The Host Checks + +### 1) Argument count and pointer kind +- `num_args` must match the number of formal parameters; otherwise the kernel returns `-1` with an error message. +- Each argument’s FFI type must be a pointer kind (for DLTensor/handle) or a valid scalar type; otherwise you’ll see errors like `Expect arg[i] to be pointer` or a scalar type error. + +### 2) Tensor checks (per tensor, after nullability decision) +- Nullability + - If the tensor is “statically reachable/used” by the function body, the handle must be non-NULL; otherwise: `xxx is expected to have non-NULL pointer`. + - If an input tensor is not used by the function (statically unreachable), NULL is allowed; other field checks are executed only when `handle != NULL`. +- Rank (`ndim`) + - Runtime `ndim` must equal the compile-time rank. +- Data type (`dtype`) + - Match the triple `(code, bits, lanes)` with tolerance: + - `float8_e4m3`: accept `e4m3`, `e4m3fn`, `e4m3fnuz`. + - `float8_e5m2`: accept `e5m2`, `e5m2fnuz`. + - `bool`: accept `int8/uint8` with `bits=8` (same lanes), `kDLBool(code=6, bits=1 or 8)`, and any `bitwidth=1` (lanes must match). + - For packed-bit dtypes (e.g., `Int(1)`, `Int(4)`, `UInt(4)`), strict dtype checking is skipped. +- Shape + - Each runtime dimension is bound to the compile-time shape (constants or symbols) and checked for consistency. + - Linear equations among symbolic dims can be solved on the fly (when there’s only one unknown at a given check point), enabling cross-tensor constraints. +- Strides + - If `buffer_type = AutoBroadcast`: allow `strides == NULL` and derive strides from `shape`. If explicit `strides` is present, bind to compile-time constraints and check for equality. + - Otherwise: check per-dimension; if `strides == NULL`, derive from `shape` and compare (e.g., contiguous: `strides[-1] == 1`, `strides[-2] == shape[-1]`). +- `byte_offset` + - Must be 0 (non-zero raises an error) to keep addressing simple and aligned. +- Device info + - Assert `device_type == target backend` (CUDA/ROCM/Metal/OneAPI/WebGPU/CPU, etc.). Error messages include a DLPack code legend. + - When multiple tensors participate, assert that `device_id` matches across them. +- Data pointer + - Must be non-NULL when the tensor is required to be non-null by the nullability rule. + +### 3) Scalar checks +- `T.int*` family: require integer; error: `Expect arg[i] to be int`. +- `T.bool`: require boolean; error: `Expect arg[i] to be boolean`. + +--- + +## Shapes and Symbolic Equations: Linear Solving +When shapes are symbolic, the host binds and (when possible) solves linear relations at runtime (only one unknown per check point). Example: + +```python +@T.prim_func +def main( + A: T.Tensor((m,), dtype), + B: T.Tensor((m + n,), dtype), + C: T.Tensor((n * k,), dtype), +): + ... +``` + +This enables enforcing cross-tensor relationships like `len(B) == m + n` and `len(C) == n * k` at runtime. + +--- + +## Nullability Rules and Examples +Which tensors may be NULL? + +- Rule: If an input tensor is not used by the function under static analysis (i.e., the access is statically unreachable), it is considered Nullable; otherwise it must be non-NULL. 
+- Examples: + +1) Must be non-NULL (used) +```python +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + A[0] = 1 +``` +Passing `None` raises: `main.A_handle is expected to have non-NULL pointer`. + +2) Still must be non-NULL (constant-true branch) +```python +some_cond: bool = True +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + if some_cond: + A[0] = 1 +``` + +3) Nullable (constant-false branch, statically unreachable) +```python +some_cond: bool = False +@T.prim_func +def main(A: T.Tensor((M, K), dtype)): + if some_cond: + A[0] = 1 +``` + +4) Must be non-NULL (runtime condition) +```python +@T.prim_func +def main(A: T.Tensor((M, K), dtype), some_cond: T.bool): + if some_cond: + A[0] = 1 +``` +Since `some_cond` is only known at runtime, static analysis cannot prove `A` is unused; `A` is thus non-nullable. + +--- + +## Device Type Codes (DLPack) +Supported and referenced device codes in error messages: `1=CPU, 2=CUDA, 7=Vulkan, 8=Metal, 10=ROCM, 14=OneAPI, 15=WebGPU`. +Kernels assert that `device_type` matches the target backend, and require `device_id` consistency across tensors. + +--- + +## Common Error Examples (What you’ll see) +- Argument count mismatch (num_args) + - Trigger: missing/extra argument + - Error: `: num_args should be N; expected: , got: N` + +- Pointer-typed argument expected + - Trigger: scalar passed where a tensor is expected + - Error: `: Expect arg[i] to be pointer` + +- Rank (ndim) mismatch + - Trigger: runtime rank differs from compile-time rank + - Error: `..ndim is expected to equal R, but got mismatched ndim` + +- Dtype mismatch + - Trigger: dtype not equal to the compiled dtype and not within the tolerance set + - Error: `..dtype is expected to be , but got incompatible dtype` + +- Shape constraint violation + - Trigger: a dimension doesn’t match a constant/symbol binding + - Error: `Argument ..shape[i] has an unsatisfied constraint: ... == ` + +- Strides check failed (e.g., non-contiguous layout) + - Trigger: transposed/sliced tensors that violate expected strides + - Error: `Argument ..strides[j] has an unsatisfied constraint: ... == ` + +- Device type mismatch + - Trigger: calling a CUDA kernel with CPU tensors, etc. + - Error: `..device_type mismatch [expected: ()] ...` + +- Device id mismatch + - Trigger: mixing tensors from different GPUs + - Error: `Argument ..device_id has an unsatisfied constraint: ... == ...` + +- NULL data pointer + - Trigger: tensor required to be non-null has a NULL data pointer + - Error: `. is expected to have non-NULL data pointer, but got NULL` + +- Scalar type mismatch + - Trigger: passing float to `T.int32`, or non-boolean to `T.bool` + - Error: `: Expect arg[i] to be int/boolean` + +--- + +## Troubleshooting Tips +- Print the host source: `print(fn.get_host_source())` to see the exact assertion and expected vs. actual fields. +- Fix strides: call `.contiguous()` for non-contiguous tensors, or avoid generating transposed/sliced layouts that break assumptions. +- Align devices: ensure all participating tensors share the same `device_type` and `device_id`. +- Align dtype: use `.to()` or construct tensors with the correct dtype; pay attention to `float8` and `bool` tolerance. +- Dynamic shapes: ensure cross-tensor linear relations can be uniquely determined at the check point (only one unknown at a time). + +--- + +## FAQ +- Can I disable the checks? + - Not recommended and usually not supported. Checks are done on the host to preserve ABI stability and fail early close to the device call. 
+- Is the overhead noticeable? + - The checks are lightweight (branches and field reads). Compared to Python-side checks, it’s faster; the dominating cost remains the Python→C boundary. Overall it’s cheaper than equivalent checks in Python. + +--- + +## Reference Example (Matmul + ReLU) + +```python +@T.prim_func +def matmul_relu_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), +): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0): + T.copy(A[by * block_M, ko * block_K], A_shared) + T.copy(B[ko * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + T.copy(C_local, C[by * block_M, bx * block_N]) + +# For debugging, print the host source +print(matmul_relu_kernel.get_host_source()) +``` + +The host will insert all checks described above for this example. + +--- + +## Quick Error Reference (Short List) +- Argument count + - Trigger: missing/extra args; Error: `num_args should be N; expected: , got: N`. +- Pointer kind + - Trigger: scalar passed to tensor arg; Error: `Expect arg[i] to be pointer`. +- Rank (ndim) + - Trigger: runtime rank != compile-time; Error: `ndim ... expected to equal R`. +- Dtype + - Trigger: mismatch and not tolerated; Error: `dtype ... expected to be `. +- Shape + - Trigger: constant/symbol binding violated; Error: `shape[i] ... == `. +- Strides + - Trigger: layout mismatch; Error: `strides[j] ... == `. +- Device type + - Trigger: wrong backend device; Error: `device_type mismatch [expected: ...]`. +- Device id + - Trigger: tensors on different GPUs; Error: `device_id ... == ...`. +- Data pointer + - Trigger: required non-NULL but NULL; Error: `non-NULL data pointer`. +- Scalar types + - Trigger: wrong scalar type; Error: `Expect arg[i] to be int/boolean`. + +--- + +## Host Error Troubleshooting (Minimal Repros) + +Below are minimal repro snippets for common host-side errors, assuming a CUDA-targeted kernel like `matmul_relu_kernel` with: + +```python +# Convention: +# A: float16 [M, K] +# B: float16 [K, N] +# C: float16 [M, N] +# Target: CUDA (device_type=2) +fn = matmul_relu_kernel # your compiled function +M = N = K = 1024 +``` + +Adjust dtype/device if your kernel differs. + +### 0. Tip: print the host source +```python +print(fn.get_host_source()) +``` + +### 1. num_args mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +# Missing C +fn(A, B) +``` +Expected: `: num_args should be 3; expected: , got: 3`. + +Fix: pass all arguments per the signature. + +### 2. Expect pointer (tensor) but got scalar +```python +import torch + +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(1, B, C) +``` +Expected: `: Expect arg[0] to be pointer`. + +Fix: pass a DLPack-compatible tensor (e.g., torch.Tensor). + +### 3. 
ndim mismatch +```python +import torch + +A = torch.empty((M, K, 1), device='cuda', dtype=torch.float16) # rank=3 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.ndim is expected to equal 2, but got mismatched ndim`. + +Fix: ensure runtime rank equals compiled rank. + +### 4. dtype mismatch +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float32) # should be float16 +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `.A_handle.dtype is expected to be float16, but got incompatible dtype`. + +Fix: `A = A.to(torch.float16)` or create with the correct dtype. + +### 5. Shape constant/symbol mismatch +```python +import torch + +A = torch.empty((M, K + 1), device='cuda', dtype=torch.float16) # K mismatched +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .A_handle.shape[i] has an unsatisfied constraint: ... == `. + +Fix: satisfy linear constraints and constants across tensors. + +### 6. Strides check failure (non-contiguous) +```python +import torch + +A = torch.empty((M, K), device='cuda', dtype=torch.float16) +A_nc = A.t() # transpose -> non-contiguous +B = torch.empty((K, N), device='cuda', dtype=torch.float16) +C = torch.empty((M, N), device='cuda', dtype=torch.float16) +fn(A_nc, B, C) +``` +Expected: `Argument .A_handle.strides[1] has an unsatisfied constraint: ... == 1`. + +Fix: pass `A_nc.contiguous()` or align the layout expectation in the kernel. + +### 7. device_type mismatch +```python +import torch + +A = torch.empty((M, K), device='cpu', dtype=torch.float16) +B = torch.empty((K, N), device='cpu', dtype=torch.float16) +C = torch.empty((M, N), device='cpu', dtype=torch.float16) +fn(A, B, C) # CUDA-targeted kernel +``` +Expected: `.A_handle.device_type mismatch [expected: 2 (cuda)] ...`. + +Fix: move tensors to the CUDA device. + +### 8. device_id mismatch (multi-GPU) +```python +import torch + +A = torch.empty((M, K), device='cuda:0', dtype=torch.float16) +B = torch.empty((K, N), device='cuda:1', dtype=torch.float16) +C = torch.empty((M, N), device='cuda:0', dtype=torch.float16) +fn(A, B, C) +``` +Expected: `Argument .B_handle.device_id has an unsatisfied constraint: ... == ...`. + +Fix: place all tensors on the same GPU (e.g., `cuda:0`). + +### 9. NULL data pointer (advanced) +This usually comes from hand-constructed DLTensor/NDArray, or external frameworks passing unallocated/freed storage. Regular `torch.Tensor` allocations rarely hit this. + +Expected: `. is expected to have non-NULL data pointer, but got NULL`. + +Fix: ensure valid underlying storage; in PyTorch scenarios, avoid constructing tensors from invalid external handles. + +### 10. Scalar type mismatch (int / bool) +```python +import tilelang.language as T + +@T.prim_func +def scalar_check(x: T.int32, flag: T.bool()): + T.evaluate(0) + +scalar_check(1.0, True) # x is float -> Expect arg[0] to be int +scalar_check(1, 2.5) # flag is float -> Expect arg[1] to be boolean +``` + +Fix: pass correct scalar types, e.g., `scalar_check(1, True)`. + +--- + +## Closing Notes +- Cross-check “shape / strides / device / dtype” against the kernel signature to localize issues efficiently. 
+- For complex symbolic relations, print the host source to confirm binding/solving order, then adjust runtime shapes/layouts accordingly. diff --git a/docs/conf.py b/docs/conf.py index 1b1289038..877b5582e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,5 @@ # General information about the project. -project = "Tile Language
" +project = "TileLang
" author = "Tile Lang Contributors" copyright = f"2025-2025, {author}" @@ -20,33 +20,27 @@ "autoapi.extension", ] -autoapi_type = 'python' -autoapi_dirs = ['../tilelang'] +autoapi_type = "python" +autoapi_dirs = ["../tilelang"] autoapi_options = [ - 'members', - 'undoc-members', - 'show-inheritance', - 'show-module-summary', - 'special-members', + "members", + "undoc-members", + "show-inheritance", + "show-module-summary", + "special-members", ] autoapi_keep_files = False # Useful for debugging the generated rst files autoapi_generate_api_docs = True -autodoc_typehints = 'description' +autodoc_typehints = "description" autoapi_ignore = ["*language/ast*", "*version*", "*libinfo*", "*parser*"] -source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', -} +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} -myst_enable_extensions = [ - "colon_fence", - "deflist", -] +myst_enable_extensions = ["colon_fence", "deflist"] redirects = {"get_started/try_out": "../index.html#getting-started"} @@ -62,13 +56,11 @@ html_theme = "furo" templates_path = [] html_static_path = ["_static"] -footer_copyright = "© 2025-2025 Tile Language" +html_css_files = ["custom.css"] +footer_copyright = "© 2025-2026 TileLang" footer_note = " " -html_theme_options = { - "light_logo": "img/logo-row.svg", - "dark_logo": "img/logo-row.svg", -} +html_theme_options = {"light_logo": "img/logo-v2.png", "dark_logo": "img/logo-v2.png"} header_links = [ ("Home", "https://github.com/tile-ai/tilelang"), diff --git a/docs/deeplearning_operators/deepseek_mla.md b/docs/deeplearning_operators/deepseek_mla.md index 08175778f..ed02b58b1 100644 --- a/docs/deeplearning_operators/deepseek_mla.md +++ b/docs/deeplearning_operators/deepseek_mla.md @@ -1,8 +1,7 @@ # 🚀 Write High Performance FlashMLA with TileLang on Hopper -
- Author: Yu Cheng + Author: Yu Cheng Author: Lei Wang
@@ -32,14 +31,14 @@ Figure 1: Performance under batch size=64 Figure 2: Performance under batch size=128 ``` -As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. +As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. Notably, **TileLang accomplishes this with just around 80 lines of Python code**, demonstrating its exceptional ease of use and efficiency. Let's dive in and see how TileLang achieves this. ## Implementation First, let's review the core computation logic of traditional FlashAttention: -```python +```python # acc_s: [block_M, block_N] # scores_max: [block_M] # scores_scale: [block_M] @@ -62,7 +61,7 @@ Compared to traditional attention operators like MHA (Multi-Headed Attention) or This raises the question of how to partition the matrix multiplication operation. On the Hopper architecture, most computation kernels use [`wgmma.mma_async`](https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions) instructions for optimal performance. The `wgmma.mma_async` instruction organizes 4 warps (128 threads) into a warpgroup for collective MMA operations. However, `wgmma.mma_async` instructions require a minimum M dimension of 64. This means each warpgroup's minimum M dimension can only be reduced to 64, but a tile size of 64*512 is too large for a single warpgroup, leading to register spilling. -Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. +Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. Our solution is to have each warpgroup compute half of `acc_s` during `Q @ K` computation, then obtain the other half computed by the other warpgroup through shared memory. @@ -106,7 +105,6 @@ T.use_swizzle(panel_size: int, order: str = "row") Here, `panel_size` specifies the width of the swizzled threadblock group, and `order` determines the swizzling pattern, which can be either "row" or "col". - ### Shared Memory Swizzling In CUDA programming, shared memory is divided into multiple memory banks, with each bank capable of servicing one thread request per clock cycle in parallel. Bank conflicts occur when multiple threads simultaneously access different addresses mapped to the same bank, forcing these accesses to be serialized and degrading performance. @@ -123,17 +121,14 @@ T.annotate_layout({ Here, `T.annotate_layout` allows users to specify any desired layout for a buffer. For convenience, TileLang provides the `make_swizzled_layout` primitive to automatically generate a swizzled layout. - ### Warp-Specialization The Hopper architecture commonly employs warp specialization for performance optimization. A typical approach is to designate one warpgroup as a producer that handles data movement using TMA (Tensor Memory Accelerator), while the remaining warpgroups serve as consumers performing computations. 
However, this programming pattern is complex, requiring developers to manually manage the execution logic for producers and consumers, including synchronization through the `mbarrier` objects. In TileLang, users are completely shielded from these implementation details. The frontend script is automatically transformed into a warp-specialized form, where TileLang handles all producer-consumer synchronization automatically, enabling efficient computation. - ### Pipeline - Pipeline is a technique used to improve memory access efficiency by overlapping memory access and computation. In TileLang, pipeline can be implemented through the `T.pipelined` annotation: ```python @@ -142,14 +137,12 @@ T.pipelined(range: int, stage: int) Here, `range` specifies the range of the pipeline, and `stage` specifies the stage of the pipeline. Multi-stage pipelining enables overlapping of computation and memory access, which can significantly improve performance for memory-intensive operators. However, setting a higher number of stages consumes more shared memory resources, so the optimal configuration needs to be determined based on specific use cases. - ### Split-KV We have also implemented Split-KV optimization similar to [FlashDecoding](https://pytorch.org/blog/flash-decoding/). Specifically, when the batch size is small, parallel SM resources cannot be fully utilized due to low parallelism. In such cases, we can split the kv_ctx dimension across multiple SMs for parallel computation and then merge the results. In our implementation, we have developed both split and combine kernels, allowing users to control the split size through a `num_split` parameter. - ## 🚀 On AMD MI300X Accelerators Following our previous demonstration of [high-performance FlashMLA implementation on NVIDIA Hopper architectures using TileLang](https://github.com/tile-ai/tilelang/blob/main/examples/deepseek_mla/README.md), this work presents an optimized implementation for AMD MI300X accelerators. We examine architectural differences and corresponding optimization strategies between these platforms. @@ -167,7 +160,7 @@ Key implementation differences between Hopper and MI300X architectures include: # Original shared memory allocation Q_shared = T.alloc_shared([block_H, dim], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) - + # Optimized register allocation Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) diff --git a/docs/deeplearning_operators/elementwise.md b/docs/deeplearning_operators/elementwise.md index 5e1243c26..6aa8e4085 100644 --- a/docs/deeplearning_operators/elementwise.md +++ b/docs/deeplearning_operators/elementwise.md @@ -8,7 +8,7 @@ :class: myclass1 myclass2 :name: a-tip-reference - This document is still **experimental** and may be incomplete. + This document is still **experimental** and may be incomplete. Suggestions and improvements are highly encouraged—please submit a PR! 
::: @@ -24,7 +24,7 @@ Please note that this tutorial does not delve deeply into the design principles ## Elementwise add in TileLang ```python -def elementwise_add(N, threads=256, dtype="bfloat16"): +def elementwise_add(N, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -43,7 +43,7 @@ Those familiar with CUDA programming might wonder where `threadIdx` fits into th The program can be compiled using the following code: ```python -program = elementwise_add(1024, threads=256, dtype="bfloat16") +program = elementwise_add(1024, threads=256, dtype=T.bfloat16) kernel = tilelang.compile(program, out_idx=-1, target="cuda", execution_backend="cython") ``` Launching the kernel is straightforward, just call it directly like a function: @@ -89,7 +89,7 @@ def elementwise_add( In the compilation process above, a fixed shape was used. However, in practical usage, we often want the kernel to support dynamic shapes. So, how can we compile a kernel in TileLang to handle dynamic shapes? In TileLang, we can replace the target size with a dynamic symbolic value, making the dimension dynamic. The following example illustrates this: ```python -program = elementwise_add(T.dynamic("N"), threads=256, dtype="bfloat16") +program = elementwise_add(T.dynamic("N"), threads=256, dtype=T.bfloat16) kernel = tilelang.compile(program, out_idx=-1, target="cuda", execution_backend="cython") ``` @@ -102,7 +102,7 @@ TileLang automatically incorporates boundary-checking conditions; however, this When compiling the example below, let's set `N` to 2047: ```python -def elementwise_add(N, num_per_thread=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, num_per_thread=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -176,7 +176,7 @@ While TileLang incorporates various optimizations for the aforementioned case, i In such scenarios, explicitly specifying the number of elements computed per thread can help "guide" TileLang's code generation process, leading to implementations that are more closely aligned with the intended design. ```python -def elementwise_add(N, num_per_thread=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, num_per_thread=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -212,7 +212,7 @@ Aha, this CUDA code aligns closely with conventional programming practices, maki But what happens if we provide additional hints to TileLang? For instance, by explicitly specifying register copies using the `T.copy(...)` operation. The example below demonstrates a vector addition implementation. Unlike the previous examples, this code explicitly loads data into registers before performing computations. 
```python -def elementwise_add(N, NUM_ELE_PER_THREAD=8, threads=256, dtype="bfloat16"): +def elementwise_add(N, NUM_ELE_PER_THREAD=8, threads=256, dtype=T.bfloat16): @T.prim_func def main(A: T.Tensor((N), dtype), B: T.Tensor((N), dtype), C: T.Tensor((N), dtype)): @@ -280,8 +280,8 @@ To evaluate complexity, one could implement the same elementwise addition operat ```c++ template -__global__ void elementwise_add(nv_bfloat16* C, - const nv_bfloat16* A, +__global__ void elementwise_add(nv_bfloat16* C, + const nv_bfloat16* A, const nv_bfloat16* B, int N) { using namespace cute; diff --git a/docs/deeplearning_operators/gemv.md b/docs/deeplearning_operators/gemv.md index c75a961b8..38287f220 100644 --- a/docs/deeplearning_operators/gemv.md +++ b/docs/deeplearning_operators/gemv.md @@ -6,7 +6,7 @@ :::{warning} - This document is still **experimental** and may be incomplete. + This document is still **experimental** and may be incomplete. Suggestions and improvements are highly encouraged—please submit a PR! ::: @@ -206,7 +206,6 @@ def splitk_gemv( return main ``` - ## Vectorized Reads GEMV is less computation intensive than GEMM as the computation intensity and memory throughput will be the optimization bottleneck. One effective strategy is to use vectorized load/store operations (e.g., `float2`, `float4`). In `TileLang`, you can specify vectorized operations via `T.vectorized`: @@ -254,7 +253,6 @@ def splitk_gemv_vectorized( With vectorized read, now the kernel finishes in **~0.0084 ms**, which is getting close to cuBLAS performance. - ## `tvm_thread_allreduce` Instead of `atomicAdd` [`tvm_thread_allreduce`](https://tvm.apache.org/docs/reference/api/python/tir/tir.html#tvm.tir.tvm_thread_allreduce) has implemented optimization when making an all-reduce across a number of threads, which should outperfrom out plain smem + `atomidAdd`: @@ -459,6 +457,5 @@ This corresponds closely to our `TileLang` program, with necessary synchronizati | splitk_gemv_vectorized | 0.00809 ms | | splitk_gemv_vectorized_tvm | 0.00675 ms | - Triton Time: 0.0077344514429569244 -In this tutorial, we implemented a simple GEMV kernel and learn that `TileLang` exposes low level control to user such as thread-level programming and CUDA primitives. \ No newline at end of file +In this tutorial, we implemented a simple GEMV kernel and learn that `TileLang` exposes low level control to user such as thread-level programming and CUDA primitives. diff --git a/docs/deeplearning_operators/matmul.md b/docs/deeplearning_operators/matmul.md index fea036ebe..12189eb8f 100644 --- a/docs/deeplearning_operators/matmul.md +++ b/docs/deeplearning_operators/matmul.md @@ -14,11 +14,11 @@ TileLang is a domain-specific language (DSL) designed for writing high-performance GPU kernels. It provides three main levels of abstraction: -* **Level 1:** A user writes pure compute logic without knowledge of or concern for hardware details (e.g., GPU caches, tiling, etc.). The compiler or runtime performs automatic scheduling and optimization. This level is conceptually similar to the idea behind TVM. +- **Level 1:** A user writes pure compute logic without knowledge of or concern for hardware details (e.g., GPU caches, tiling, etc.). The compiler or runtime performs automatic scheduling and optimization. This level is conceptually similar to the idea behind TVM. 
-* **Level 2:** A user is aware of GPU architecture concepts—such as shared memory, tiling, and thread blocks—but does not necessarily want to drop down to the lowest level of explicit thread control. This mode is somewhat comparable to Triton's programming model, where you can write tile-level operations and let the compiler do layout inference, pipelining, etc. +- **Level 2:** A user is aware of GPU architecture concepts—such as shared memory, tiling, and thread blocks—but does not necessarily want to drop down to the lowest level of explicit thread control. This mode is somewhat comparable to Triton's programming model, where you can write tile-level operations and let the compiler do layout inference, pipelining, etc. -* **Level 3:** A user takes full control of thread-level primitives and can write code that is almost as explicit as a hand-written CUDA/HIP kernel. This is useful for performance experts who need to manage every detail, such as PTX inline assembly, explicit thread behavior, etc. +- **Level 3:** A user takes full control of thread-level primitives and can write code that is almost as explicit as a hand-written CUDA/HIP kernel. This is useful for performance experts who need to manage every detail, such as PTX inline assembly, explicit thread behavior, etc. ```{figure} ../_static/img/overview.png :width: 50% @@ -52,12 +52,12 @@ While Level 1 in TileLang can be very comfortable for general users—since it r Below is a simplified code snippet for a 1024 x 1024 x 1024 matrix multiplication. It uses: -* **`T.Kernel(...)`** to initialize the thread block configuration (grid dimensions, block size, etc.). -* **`T.alloc_shared(...)`** to allocate GPU shared memory. -* **`T.alloc_fragment(...)`** to allocate a register fragment for accumulation. -* **`T.Pipelined(...)`** to express software pipelining across the K dimension. -* **`T.Parallel(...)`** to parallelize data copy loops. -* **`T.gemm(...)`** to perform tile-level GEMM operations (which map to the appropriate backends, such as MMA instructions on NVIDIA GPUs). +- **`T.Kernel(...)`** to initialize the thread block configuration (grid dimensions, block size, etc.). +- **`T.alloc_shared(...)`** to allocate GPU shared memory. +- **`T.alloc_fragment(...)`** to allocate a register fragment for accumulation. +- **`T.Pipelined(...)`** to express software pipelining across the K dimension. +- **`T.Parallel(...)`** to parallelize data copy loops. +- **`T.gemm(...)`** to perform tile-level GEMM operations (which map to the appropriate backends, such as MMA instructions on NVIDIA GPUs). ```python import tilelang @@ -147,14 +147,12 @@ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, - This sets up the block grid dimensions based on N/block_N and M/block_M. - `threads=128` specifies that each thread block uses 128 threads. The compiler will infer how loops map to these threads. - ```{figure} ../_static/img/Parallel.png :alt: Parallel :align: center ``` - 2. **Shared & Fragment Memory**: ```python @@ -182,7 +180,6 @@ for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): ``` - 4. 
**Parallel Copy**: ```python @@ -252,8 +249,8 @@ For more advanced usage—including partial lowering, explicitly controlling thr ## Further Resources -* [TileLang GitHub](https://github.com/tile-ai/tilelang) -* [BitBLAS](https://github.com/tile-ai/bitblas) -* [Triton](https://github.com/openai/triton) -* [Cutlass](https://github.com/NVIDIA/cutlass) -* [PyCUDA](https://documen.tician.de/pycuda/) +- [TileLang GitHub](https://github.com/tile-ai/tilelang) +- [BitBLAS](https://github.com/tile-ai/bitblas) +- [Triton](https://github.com/openai/triton) +- [Cutlass](https://github.com/NVIDIA/cutlass) +- [PyCUDA](https://documen.tician.de/pycuda/) diff --git a/docs/deeplearning_operators/matmul_sparse.md b/docs/deeplearning_operators/matmul_sparse.md new file mode 100644 index 000000000..8caa6182f --- /dev/null +++ b/docs/deeplearning_operators/matmul_sparse.md @@ -0,0 +1,261 @@ +# Sparse Matrix-Matrix Multiplication with Tile Library + +
+ Author: botbw +
+ +:::{warning} + This document is still **experimental** and may be incomplete. + + This feature is still **experimental** and need further optimization. + + Suggestions and improvements are highly encouraged—please submit a PR! +::: + +:::{tip} +It's suggested to go through `docs/deeplearning_operators/matmul.md` first. + +Example code can be found at `examples/gemm_sp`. +::: + +## Structured sparsity in the NVIDIA Ampere architecture + +Since the Ampere architecture (sm80 and above), sparsity support has been integrated into Tensor Cores. This allows a 2:4 (or 1:2 for 32-bit data types) semi-structured matrix to be compressed into its non-zero values along with associated metadata, which can then be fed into the Tensor Core. This enables up to **2x throughput** compared to the equivalent dense computation. + +:::{warning} + This tutorial primarily focuses on CUDA, as this feature is not yet supported on ROCm. However, AMD provides a similar capability in the matrix cores of GPUs such as the MI300X. +::: + +```{figure} ../_static/img/sparse_mma_storage_example.png +:align: center + +Figure: Sparse MMA storage example (from PTX doc) +``` + +## Compress a dense tensor + +To utilize sparse Tensor Cores, a dense tensor must first be **compressed** into its non-zero values along with the corresponding metadata. + +Both `PyTorch` and `vLLM` use `CUTLASS` as their computation backend (see references [here](https://github.com/pytorch/pytorch/blob/a8d6afb511a69687bbb2b7e88a3cf67917e1697e/aten/src/ATen/native/sparse/cuda/SparseSemiStructuredOps.cu#L47) and [here](https://github.com/vllm-project/vllm/blob/a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh#L116)), leveraging `CUTLASS`’s built-in compressor (or reimplementing it in `PyTorch`). + +A set of **CUTLASS-compatible** compressors is provided in `tilelang.utils.sparse`, where a dense tensor—along with other required arguments (e.g., block_K for sm90, transpose options)—can be passed in to perform the compression. + +```python +from tilelang.utils.sparse import compress +A_sparse, E = compress(A, transposed=trans_A, block_k=block_K) +``` + +Here, `A_sparse` contains all the non-zero elements of `A`, while `E` stores the corresponding metadata (indexing information) required to reconstruct the original sparse pattern. + +> NOTE: When using CUTLASS compressor, there is no naive position correspondence between the positions in `A_sparse`/`A` and `E`. (i.e. the 4-element group at [n, k] doesn't match the 4-bit metadata at [n, k] if you consider metadata as int4 tensor) +The metadata is reordered internally to optimize memory access patterns (e.g., for ldsm instructions and vectorized loads). +For more information, see **A note on `gemm_sp` and `gemm_sp_v2`**. + +## `T.gemm_sp` with CUTLASS's compressor + +:::{warning} + +It is strongly recommended to use T.gemm_sp_v2 due to its greater flexibility and faster compilation time. + +::: + +A 2:4 sparse GEMM kernel is similar to its dense counterpart, except that it also requires handling the associated metadata. + +Check comments in below kernel code for required modification. 
+ +```python +def matmul_sp_sm80( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + trans_A, + trans_B, +): + is_8_bit = "8" in in_dtype + metadata_dtype = 'int32' if is_8_bit else 'int16' + E_factor = SparseTensorCoreIntrinEmitter.E_FACTOR_MAP[in_dtype][metadata_dtype] # Calculate shape for given datatypes + A_sparse_shape = (M, K // 2) if not trans_A else (K // 2, M) + B_shape = (K, N) if not trans_B else (N, K) + A_shared_shape = (block_M, block_K // 2) if not trans_A else (block_K // 2, block_M) + B_shared_shape = (block_K, block_N) if not trans_B else (block_N, block_K) + + import tilelang.language as T + + @T.prim_func + def main( + A_sparse: T.Tensor(A_sparse_shape, in_dtype), + E: T.Tensor((M, K // E_factor), metadata_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + E_shared = T.alloc_shared((block_M, block_K // E_factor), metadata_dtype) # Allocate smem for metadata + C_frag = T.alloc_fragment((block_M, block_N), accum_dtype) + T.annotate_layout({ # Annotate reordered cutlass metadata layout + E: + make_cutlass_metadata_layout(E, mma_dtype=in_dtype, arch="8.0"), + E_shared: + make_cutlass_metadata_layout( + E_shared, mma_dtype=in_dtype, arch="8.0"), + }) + T.clear(C_frag) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(E[by * block_M, k * block_K // E_factor], E_shared) + if trans_A: + T.copy(A_sparse[k * block_K // 2, by * block_M], A_shared) + else: + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp(A_shared, E_shared, B_shared, C_frag, trans_A, trans_B) # Call gemm_sp with non-zero values and metadata + T.copy(C_frag, C[by * block_M, bx * block_N]) + + return main +``` + +Under the hood, `gemm_sp` invokes templates adapted from `CUTLASS`, and a compatible metadata layout must be specified using `T.annotate_layout`. + +## `T.gemm_sp_v2` with a custom compressor + +To migrate to `gemm_sp_v2`, simply replace occurrences of `gemm_sp`. + +Unlike `gemm_sp`, `gemm_sp_v2` can operate without `T.annotate_layout`, and it also supports user-defined layouts and compressors. + +The metadata is stored in a `(u)int8`/`(u)int16`/`(u)int32` tensor, where **each 4-bit chunk represents two 2-bit indices** of non-zero elements within four consecutive elements. Here, we start with an `int16` example, which is the **default dtype** for `bf16` and `fp16` on Ampere GPUs. 
+
+Suppose we have the following row vector:
+```python
+t = tensor([[0, 7, 0, 3], [1, 5, 0, 0], [0, 0, 2, 4], [9, 0, 9, 0]], dtype=torch.float16).flatten()
+```
+
+The non-zero elements and their corresponding indices are:
+
+```python
+t_sp = tensor([[7, 3], [1, 5], [2, 4], [9, 9]], dtype=torch.float16).flatten()
+indices = tensor([[1, 3], [0, 1], [2, 3], [0, 2]], dtype=torch.float16).flatten()
+```
+
+The corresponding 16-bit metadata is:
+```python
+# metadata_bits = tensor([0b1101, 0b0100, 0b1110, 0b1000])
+# Note: storage uses little-endian order: tensor(0b1000111001001101, dtype=torch.int16)
+# Note: the above code is not runnable in python as the interpreter won't take the binary
+# as 2's complement
+metadata_int16 = tensor(-29107)
+```
+
+You can decode an int16 metadata tensor using the following utility:
+```python
+def decode_metadata(meta: torch.Tensor) -> torch.Tensor:
+    assert meta.dtype is torch.int16
+    groups_per_meta = 16 // 4
+    out = []
+    for g in range(groups_per_meta):
+        group_bits = (meta >> (g * 4)) & 0xF
+        idx0 = group_bits & 0x3
+        idx1 = (group_bits >> 2) & 0x3
+        out.append(torch.stack([idx0, idx1], dim=-1))
+    return torch.concat(out, dim=-1).view(meta.shape[0], -1)
+```
+
+The compressor can be implemented at either the `PyTorch`/`NumPy` level or the kernel level.
+
+For example, `PyTorch` provides an Ampere compressor [here](https://github.com/pytorch/pytorch/blob/267d0197bfca0232488d51dd1ff735d619adc2cf/torch/sparse/_semi_structured_conversions.py#L47-L179). Note that in this implementation, a [permutation](https://github.com/pytorch/pytorch/blob/267d0197bfca0232488d51dd1ff735d619adc2cf/torch/sparse/_semi_structured_conversions.py#L173-L175) is applied to match CUTLASS’s metadata layout. If you do not annotate a metadata layout when using `gemm_sp_v2`, your compressor should replicate the same behavior as the PyTorch example—but without using the `_calculate_meta_reordering_scatter_offsets` function.
+
+If you want to use a custom metadata layout in your kernel, one approach is to define the layout in `TileLang` and then apply the same layout to both your compressor kernel and the matmul_sp kernel.
+ +```python + +@tilelang.jit(out_idx=[1, 2], pass_configs={ + tilelang.PassConfigKey.TIR_DISABLE_VECTORIZE: True, +}) +def compress_kernel(M, K, block_M, block_K, dtype, use_cutlass_layout): + e_factor, e_dtype = ARCH_INFO["8.0"] + e_K = K // e_factor + elem, group = 2, 4 + + assert M % block_M == 0, "M must be divisible by block_M" + assert K % block_K == 0, "K must be divisible by block_K" + assert K % e_factor == 0, "K must be divisible by e_factor" + assert block_K % e_factor == 0, "block_K must be divisible by e_factor" + + @T.prim_func + def kernel( + A: T.Tensor((M, K), dtype), + A_sp: T.Tensor((M, K // 2), dtype), + E: T.Tensor((M, e_K), e_dtype), + ): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(K, block_K), threads=block_M) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + A_sp_shared = T.alloc_shared((block_M, block_K // 2), dtype) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + if use_cutlass_layout: # NOTE: Make sure compressor metadata layout + T.annotate_layout({ # is same with your computation kernel + E: + make_cutlass_metadata_layout( + E, mma_dtype="float16", arch="8.0", block_k=block_K), + E_shared: + make_cutlass_metadata_layout( + E_shared, + mma_dtype="float16", + arch="8.0", + block_k=block_K), + }) + T.clear(A_sp_shared) + T.clear(E_shared) + non_zero_cnt = T.alloc_local((1, ), dtype="uint8") + non_zero_elt_log_idx = T.alloc_local((elem, ), dtype="uint8") + T.copy(A[bx * block_M, by * block_K], A_shared) + for tm in T.Parallel(block_M): + for g_i in range(0, block_K // group): + a_k = g_i * group + T.clear(non_zero_cnt) + T.clear(non_zero_elt_log_idx) + for i in range(group): + val = A_shared[tm, a_k + i] + if val != 0.0: + non_zero_elt_log_idx[non_zero_cnt[0]] = i + A_sp_shared[tm, a_k // 2 + non_zero_cnt[0]] = val + non_zero_cnt[0] += 1 + if non_zero_cnt[0] == 1 and non_zero_elt_log_idx[0] == 3: + non_zero_elt_log_idx[0] = 0 + non_zero_elt_log_idx[1] = 3 + A_sp_shared[tm, a_k // 2 + 1] = A_sp_shared[tm, a_k // 2] + A_sp_shared[tm, a_k // 2] = 0.0 + elif non_zero_cnt[0] == 1: + A_sp_shared[tm, a_k // 2 + 1] = 0 + non_zero_elt_log_idx[1] = 3 + for i in T.serial(elem): + val = non_zero_elt_log_idx[i] + E_shared[tm, a_k // e_factor] |= T.shift_left(val, 4 * (g_i % (e_factor // group)) + 2 * i) + T.copy(A_sp_shared, A_sp[bx * block_M, by * block_K // 2]) + T.copy(E_shared, E[bx * block_M, by * block_K // e_factor]) + + return kernel +``` + +## A note on `gemm_sp` and `gemm_sp_v2` + +Initially, `T.gemm_sp` followed the same design as `T.gemm`, lowering to a `CUTLASS` template. This inherently requires metadata to be reordered offline following a predetermined layout. + +However, fixing a specific layout introduces several potential issues: + +1. Painful debugging experience: Debugging a failed kernel becomes difficult due to the reordered indexing, including permutations and swizzling. + +2. Limited flexibility: For example, concatenating two compressed tensors, such as `A_sparse_0` and `A_sparse_1`, into a new `A_sparse` makes sense. However, concatenating their metadata `E_0` and `E_1` may not be valid unless the layout allows it mathematically. + +3. Alignment requirements: `CUTLASS` enforces strict alignment checks, and many hyperparameter configurations can lead to compilation errors. (For reference, sm8x was implemented in `CUTLASS 2`.) + +`T.gemm_sp_v2` was designed to address these limitations, following the approach of `T.gemm_v2`. It lowers directly to PTX, removing the need for a fixed metadata layout. 
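+
+## Putting it together (illustrative sketch)
+
+To close the loop, the sketch below compresses a 2:4-pruned dense matrix with the built-in compressor and runs it through the `matmul_sp_sm80` example from above, checking the result against a dense reference. It is a minimal illustration, not part of the example files: it assumes an sm80+ CUDA device, that the helpers referenced inside `matmul_sp_sm80` (e.g., `SparseTensorCoreIntrinEmitter`, `make_cutlass_metadata_layout`) are imported, and that `compress` returns metadata in the `int16` layout this kernel expects.
+
+```python
+import torch
+import tilelang
+from tilelang.utils.sparse import compress
+
+M = N = K = 1024
+block_M, block_N, block_K = 128, 128, 64
+
+# Build the prim_func from the sm80 example above and compile it (C is returned via out_idx).
+program = matmul_sp_sm80(M, N, K, block_M, block_N, block_K,
+                         "float16", "float16", "float32",
+                         num_stages=2, threads=128, trans_A=False, trans_B=False)
+kernel = tilelang.compile(program, out_idx=-1, target="cuda")
+
+# Make A 2:4 sparse by zeroing two of every four elements along K, then compress it.
+A = torch.randn(M, K, device="cuda", dtype=torch.float16)
+A[:, 0::4] = 0
+A[:, 1::4] = 0
+A_sparse, E = compress(A, transposed=False, block_k=block_K)
+
+B = torch.randn(K, N, device="cuda", dtype=torch.float16)
+C = kernel(A_sparse, E, B)
+torch.testing.assert_close(C, (A @ B).to(C.dtype), rtol=1e-2, atol=1e-2)
+```
+
+The same flow applies to `T.gemm_sp_v2`: swap the call inside the kernel and, if you use a custom compressor, keep its metadata layout consistent with the one the kernel annotates.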
diff --git a/docs/get_started/Installation.md b/docs/get_started/Installation.md index f441d1a83..ea980b59b 100644 --- a/docs/get_started/Installation.md +++ b/docs/get_started/Installation.md @@ -15,7 +15,7 @@ We currently provide three methods to install **TileScale**: ```bash docker pull nvcr.io/nvidia/pytorch:25.03-py3 -docker run --name tilescale --ipc=host --network=host --privileged --cap-add=SYS_ADMIN --shm-size=10g --gpus=all -it nvcr.io/nvidia/pytorch:25.03-py3 /bin/bash +docker run --name tilescale --ipc=host --network=host --privileged --cap-add=SYS_ADMIN --shm-size=10g --gpus=all -it nvcr.io/nvidia/pytorch:25.03-py3 /bin/bash echo -n > /etc/pip/constraint.txt bash Miniconda3-latest-Linux-x86_64.sh # install conda conda install -c conda-forge libstdcxx-ng @@ -44,7 +44,7 @@ Verify that **TileScale** is working correctly: python -c "import tilelang; print(tilelang.__version__)" ``` -You can now run TileScale examples and develop your applications. +You can now run TileScale examples and develop your applications. **Example Usage:** @@ -55,12 +55,11 @@ cd /home/tilelang TILELANG_USE_DISTRIBUTED=1 python examples/distributed/example_allgather_gemm_overlapped.py ``` - ## To use NVSHMEM APIs Before running the examples using NVSHMEM APIs (e.g., [example_allgather.py](../../examples/distributed/example_allgather.py)), you need to build NVSHMEM library for device-side code generation. -```bash +```bash pip install mpich # building NVSHMEM needs MPI export NVSHMEM_SRC="your_custom_nvshmem_dir" # default to 3rdparty/nvshmem_src cd tilelang/distributed diff --git a/docs/get_started/overview.md b/docs/get_started/overview.md index 18fa9f193..a7c154f31 100644 --- a/docs/get_started/overview.md +++ b/docs/get_started/overview.md @@ -15,49 +15,49 @@ Figure 1: High-level overview of the TileLang compilation flow. ## Programming Interfaces 1. **Beginner Level (Hardware-Unaware)** - - Intended for users who need to write code that is independent of specific hardware details. - - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations. + - Intended for users who need to write code that is independent of specific hardware details. + - The goal is to let developers focus on the basic logic without worrying about memory hierarchies or hardware-specific optimizations. - *Note:* This interface is not yet fully implemented. 2. **Developer Level (Hardware-Aware with Tile Library)** - - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations. - - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures. + - Designed for developers who have a basic understanding of GPU memory hierarchies and performance considerations. + - Provides a **Tile Library**, containing predefined operations and patterns optimized for various hardware architectures. - Users at this level can leverage these ready-made primitives without diving into low-level threading details. 3. **Expert Level (Hardware-Aware with Thread Primitives)** - - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing). - - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels. 
+ - For highly experienced users who have an in-depth understanding of low-level hardware characteristics (e.g., threading models, memory coalescing). + - Offers direct access to **thread primitives** and other low-level constructs, allowing for fine-grained control of performance-critical kernels. - This level grants maximum flexibility for specialized optimizations tailored to specific GPU or multi-core architectures. ## Compilation Flow -1. **Tile Program** +1. **Tile Program** A high-level specification of the computation. Depending on the user’s expertise, they may write a purely hardware-unaware tile program or incorporate constructs from the Tile Library or thread primitives. -2. **Tile Program with Tile Library** +2. **Tile Program with Tile Library** When developers choose from the Tile Library, the original Tile Program is expanded with specialized library calls. These calls encapsulate efficient implementation patterns for different operations. -3. **Tile Program with Thread Primitives** +3. **Tile Program with Thread Primitives** Expert-level developers can explicitly use low-level threading constructs to hand-optimize data layout, synchronization, and memory usage. -4. **IRModule** +4. **IRModule** After the program is composed with libraries or thread primitives, it is lowered to an intermediate representation (IR) that captures the necessary hardware details. -5. **Source Code Generation (C/CUDA/HIP/LLVM/…)** +5. **Source Code Generation (C/CUDA/HIP/LLVM/…)** From the IR, the system generates target-specific source code. This source code is tuned for the desired backends or GPU architectures (e.g., NVIDIA, AMD). -6. **Hardware-Specific Executable/Runtime** +6. **Hardware-Specific Executable/Runtime** Finally, the generated source is compiled into hardware-specific executables, ready to run on the corresponding devices. The pipeline supports multiple GPU backends and can be extended to additional architectures. ## Tile-based Programming Model -[Figure 2](#fig-overview-gemm) provides a concise matrix multiplication (GEMM) example in ``TileLang``, -illustrating how developers can employ high-level constructs such as tiles, memory placement, pipelining, +[Figure 2](#fig-overview-gemm) provides a concise matrix multiplication (GEMM) example in ``TileLang``, +illustrating how developers can employ high-level constructs such as tiles, memory placement, pipelining, and operator calls to manage data movement and computation with fine-grained control. -In particular, this snippet ([Figure 2](#fig-overview-gemm) (a)) demonstrates how multi-level tiling -leverages different memory hierarchies (global, shared, and registers) to optimize bandwidth utilization +In particular, this snippet ([Figure 2](#fig-overview-gemm) (a)) demonstrates how multi-level tiling +leverages different memory hierarchies (global, shared, and registers) to optimize bandwidth utilization and reduce latency. -Overall, [Figure 2](#fig-overview-gemm) (b) showcases how the Python-like syntax of ``TileLang`` +Overall, [Figure 2](#fig-overview-gemm) (b) showcases how the Python-like syntax of ``TileLang`` allows developers to reason about performance-critical optimizations within a user-friendly programming model. 
```{figure} ../_static/img/MatmulExample.png diff --git a/docs/get_started/run_example.md b/docs/get_started/run_example.md index aced5d5a8..e25f42fb8 100644 --- a/docs/get_started/run_example.md +++ b/docs/get_started/run_example.md @@ -5,11 +5,11 @@ Before running, enable TileLang’s distributed mode: ```bash -export TILELANG_USE_DISTRIBUTED=1 +export TILELANG_USE_DISTRIBUTED=1 ``` Then start an example directly with Python: ```bash - python examples/distributed/primitives/example_put_warp.py + python examples/distributed/primitives/example_put_warp.py ``` ## Examples using NVSHMEM APIs @@ -18,4 +18,4 @@ Use the provided launcher `tilelang/distributed/launch.sh` to start programs tha ```bash GPUS=2 ./tilelang/distributed/launch.sh examples/distributed/example_allgather.py ``` -You can change GPUS to the number of local GPUs you want to use. The launcher will set the required environment variables and invoke `torch.distributed.run`. \ No newline at end of file +You can change GPUS to the number of local GPUs you want to use. The launcher will set the required environment variables and invoke `torch.distributed.run`. diff --git a/docs/get_started/targets.md b/docs/get_started/targets.md index c2b3f2fb5..3a464bd66 100644 --- a/docs/get_started/targets.md +++ b/docs/get_started/targets.md @@ -14,6 +14,7 @@ the generated code. The most frequent choices are listed below: | --------- | ----------- | | `auto` | Detects CUDA → HIP → Metal in that order. Useful when running the same script across machines. | | `cuda` | NVIDIA GPUs. Supports options such as `-arch=sm_80`, `-max_num_threads=1024`, etc. | +| `cutedsl` | NVIDIA CUTLASS/CuTe DSL backend. Requires `nvidia-cutlass-dsl`. `cuda` options can also be applied to this target. | | `hip` | AMD GPUs via ROCm. Options like `-mcpu=gfx90a` can be appended. | | `metal` | Apple Silicon GPUs (arm64 Macs). | | `llvm` | CPU execution; accepts the standard TVM LLVM switches. | diff --git a/docs/index.md b/docs/index.md index 5d9a158f8..ca5a125eb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,10 +2,10 @@ [GitHub](https://github.com/tile-ai/tilelang) -Tile Language (tile-lang) is a concise domain-specific language designed to streamline -the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). -By employing a Pythonic syntax with an underlying compiler infrastructure on top of TVM, -tile-lang allows developers to focus on productivity without sacrificing the +Tile Language (tile-lang) is a concise domain-specific language designed to streamline +the development of high-performance GPU/CPU kernels (e.g., GEMM, Dequant GEMM, FlashAttention, LinearAttention). +By employing a Pythonic syntax with an underlying compiler infrastructure on top of TVM, +tile-lang allows developers to focus on productivity without sacrificing the low-level optimizations necessary for state-of-the-art performance. 
:::{toctree} @@ -17,13 +17,25 @@ get_started/overview get_started/targets ::: - :::{toctree} :maxdepth: 1 :caption: TUTORIALS tutorials/debug_tools_for_tilelang tutorials/auto_tuning +tutorials/logging +::: + +:::{toctree} +:maxdepth: 1 +:caption: PROGRAMMING GUIDES + +programming_guides/overview +programming_guides/language_basics +programming_guides/instructions +programming_guides/control_flow +programming_guides/autotuning +programming_guides/type_system ::: :::{toctree} @@ -33,6 +45,7 @@ tutorials/auto_tuning deeplearning_operators/elementwise deeplearning_operators/gemv deeplearning_operators/matmul +deeplearning_operators/matmul_sparse deeplearning_operators/deepseek_mla ::: @@ -42,6 +55,7 @@ deeplearning_operators/deepseek_mla compiler_internals/letstmt_inline compiler_internals/inject_fence_proxy +compiler_internals/tensor_checks ::: :::{toctree} diff --git a/docs/programming_guides/autotuning.md b/docs/programming_guides/autotuning.md new file mode 100644 index 000000000..9cc5a2d94 --- /dev/null +++ b/docs/programming_guides/autotuning.md @@ -0,0 +1,308 @@ +# Autotuning + +TileLang includes a built‑in autotuner that searches configuration spaces +for the best performing kernel, compiles candidates in parallel, validates +correctness, benchmarks them, and caches the best result for reuse. + +This guide covers two workflows: +- Decorator‑based: `@tilelang.autotune(configs=...)` stacked on `@tilelang.jit` +- Programmatic: `AutoTuner.from_kernel(...).set_*().run()` + +It also explains input tensor supply, validation, caching, and environment +variables that affect parallelism and cache behavior. + +## 1) Decorator‑based Autotune + +Use `@tilelang.autotune` above `@tilelang.jit` and expose tunable parameters as +function arguments with defaults. The autotuner overrides these parameters with +values from your config space. 
+ +```python +import tilelang +import tilelang.language as T + +def matmul_configs(M, N, K): + # Example space — tailor to your target + tiles = [64, 128] + stages = [2, 3] + threads = [128, 256] + return [ + dict(block_M=BM, block_N=BN, block_K=BK, num_stages=S, threads=TH) + for BM in tiles + for BN in tiles + for BK in [32, 64] + for S in stages + for TH in threads + ] + +@tilelang.autotune(configs=matmul_configs, warmup=25, rep=100, timeout=60) +@tilelang.jit(out_idx=[-1]) +def matmul(M: int, N: int, K: int, + block_M: int = 128, block_N: int = 128, block_K: int = 32, + threads: int = 128, num_stages: int = 3, + dtype: str = 'float16', accum_dtype: str = 'float32'): + + @T.prim_func + def kernel(A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype)): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_s = T.alloc_shared((block_M, block_K), dtype) + B_s = T.alloc_shared((block_K, block_N), dtype) + C_f = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, ko * block_K], A_s) + T.copy(B[ko * block_K, bx * block_N], B_s) + T.gemm(A_s, B_s, C_f) + + T.copy(C_f, C[by * block_M, bx * block_N]) + + return kernel + +# Usage +# Provide inputs via context (recommended for reproducibility across configs) +import torch +M = N = K = 1024 +A = torch.randn(M, K, device='cuda', dtype=torch.float16) +B = torch.randn(K, N, device='cuda', dtype=torch.float16) +C = torch.empty(M, N, device='cuda', dtype=torch.float16) + +from tilelang.autotuner import set_autotune_inputs +with set_autotune_inputs(A, B, C): + tuned_kernel = matmul(M, N, K) # compiles, tunes, returns best kernel + tuned_kernel(A, B, C) # run best kernel +``` + +Notes +- `configs` can be a list of dicts or a callable `(args...) -> list[dict]`. Each + dict’s keys must match the tunable function arguments (e.g., `block_M`). +- The decorator returns a callable that runs autotune once per argument tuple + and caches the resulting best kernel in‑process. +- For explicit input control during tuning, wrap the call with + `set_autotune_inputs(...)`. Otherwise, `supply_type` (below) is used. + +## 2) Programmatic Autotune + +Use the `AutoTuner` class to manage configs and arguments more explicitly. 
+ +```python +from tilelang.autotuner import AutoTuner + +kernel_factory = matmul # the function above (already @tilelang.jit) +tuner = AutoTuner.from_kernel(kernel_factory(M, N, K), configs=matmul_configs(M, N, K)) + +tuner.set_profile_args( + warmup=25, rep=100, timeout=60, + supply_type=tilelang.TensorSupplyType.Auto, # or provide supply_prog/ref_prog + ref_prog=lambda A, B, C: torch.allclose(C, (A @ B).to(C.dtype), rtol=1e-2, atol=1e-2), +) + +tuner.set_compile_args( + target='auto', # or 'cuda'/'hip'/'metal' + execution_backend='auto', # resolves per-target + out_idx=[-1], # which outputs to return if multiple + pass_configs={ # optional TVM passes/flags + # tilelang.PassConfigKey.EXAMPLE_KEY: value, + }, +) + +artifact = tuner.run() # compiles + runs + validates all configs +best_kernel = artifact.kernel # JITKernel +best_latency = artifact.latency +best_config = artifact.config + +# Reuse best kernel +best_kernel(A, B, C) +``` + +### Example Gallery (in repo) +- examples/gdn/example_chunk_delta_h.py:101 — uses `@autotune` to sweep configs +- examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py:451 — uses `@tilelang.autotune` +- examples/quickstart.py:84 — profiles a tuned kernel with `get_profiler` +- examples/hadamard_transform/example_hadamard.py:152 — profiler with custom warmup +- examples/dynamic_shape/example_dynamic.py:94 — profiler for dynamic shapes +- examples/gemm/example_gemm_persistent.py:135 — compare persistent vs non‑persistent + +Click any path to open the code and compare patterns. + +## Input Tensor Supply + +The tuner needs inputs to compile and benchmark kernels. Provide them in one of +three ways (priority order): + +1) Context manager (fixed inputs across configs) +```python +with set_autotune_inputs(A, B, C): + tuned = matmul(M, N, K) +``` + +2) Custom supplier program +```python +def supply_prog(signature): + # signature holds KernelParam objects describing shapes/dtypes + # Return a list of torch tensors matching the kernel’s arguments + return [A, B, C] + +tuner.set_profile_args(supply_prog=supply_prog) +``` + +3) Built‑in generators via `supply_type` +- `TensorSupplyType.Auto` (default): heuristic per dtype (uniform ints / fp ranges) +- `Integer`, `Uniform`, `Normal`, `Randn`, `Zero`, `One` + +Important +- Built‑in generators require static shapes; if your PrimFunc uses symbolic + dimensions (T.dyn), supply concrete inputs via (1) or (2). +- Float8 dtypes require PyTorch 2.1+ for `torch.float8_*` support. + +## Correctness Checking and Tolerances + +Use one of the following validation methods: +- `ref_prog`: Provide a reference program that receives the same inputs and + checks results. You can return a boolean or raise on mismatch. +- `manual_check_prog`: A callable that inspects outputs and raises on mismatch. +- `skip_check=True`: Skip correctness checks (faster, use with caution). + +Control numeric drift via: +- `rtol` and `atol` (defaults 1e‑2) +- `max_mismatched_ratio` (default 1%) + +## Configuration Spaces and Best Practices + +What to tune +- Tile sizes: `block_M`, `block_N`, `block_K` +- Software pipelining: `num_stages` +- Threads per block: `threads` (or (x, y) tuple) +- Optional: dtype variants, epilogues, small scheduling knobs + +Tips +- Start from a working baseline. Tune a small, meaningful space first. +- Respect hardware limits (shared memory bytes, registers per thread/block, + max threads per block). Eliminate impossible configs up‑front. +- Keep block sizes multiples of vector widths and warp sizes when relevant. 
+- Use `set_autotune_inputs` to ensure each config is measured on identical data. +- Record your best configs and bake them as defaults when stable. + +## Parallel Compilation/Benchmarking and Timeouts + +The tuner compiles configurations in parallel using a thread pool and benchmarks +them with a per‑config timeout. On CUDA, each worker sets the current device to +avoid context issues. + +Notes +- `timeout` uses POSIX signals; on non‑Unix systems, it may not take effect. +- Logs are written to `autotuner.log` in the working directory. + +## Caching + +The autotuner caches best artifacts both in‑memory (per process) and on disk under +`$TILELANG_CACHE_DIR/autotuner`. The cache key includes: +- TileLang version, function source, closure free‑vars +- Config list, compile args, profile args + +Disk cache contents (per key) +- Best config and latency: `best_config.json`, `latency.json` +- Kernel sources and library: `device_kernel.cu`, `host_kernel.cu`, `kernel_lib.so` (or `kernel.cubin`/`executable.so` depending on backend) +- Function and params: `function.pkl`, `params.pkl` + +Control via env vars (tilelang.env) +- `TILELANG_CACHE_DIR` (default `~/.tilelang/cache`) +- `TILELANG_TMP_DIR` (default `$TILELANG_CACHE_DIR/tmp`) +- Disable all kernel caches: `TILELANG_DISABLE_CACHE=1` +- Disable autotune disk cache only: `TILELANG_AUTO_TUNING_DISABLE_CACHE=1` + +CPU worker control +- `TILELANG_AUTO_TUNING_CPU_UTILITIES` (fraction, default 0.9) +- `TILELANG_AUTO_TUNING_CPU_COUNTS` (int, `-1` auto) +- `TILELANG_AUTO_TUNING_MAX_CPU_COUNT` (int, `-1` unlimited) + +Backend notes +- NVRTC backend persists `.cubin` and a Python launcher. +- Torch/DLPack backend may not save artifacts to disk; in this case, only + in‑memory caching applies and a warning is logged. + +## Alternative: Manual Sweeps with par_compile + +If you prefer manual control, use `JITImpl.par_compile` to compile a batch of +configs and drive your own benchmarking: + +```python +@tilelang.jit +def factory(M, N, K, block_M=128, block_N=128, block_K=32): + @T.prim_func + def k(A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16')): + ... + return k + +impl = factory # JITImpl +cfgs = [ + dict(block_M=64, block_N=128, block_K=32), + dict(block_M=128, block_N=128, block_K=64), +] +kernels = impl.par_compile(cfgs, num_workers=4) +# Now benchmark kernels[i](A, B, C) yourself +``` + +## Recording and Reusing Best Configs + +The programmatic path returns an `AutotuneResult` that can be saved and later +reloaded. This is useful for CI, multi‑host workflows, or shipping tuned configs. + +```python +artifact = tuner.run() # AutotuneResult + +# Save to disk +from pathlib import Path +save_dir = Path('out/best/matmul_1024') +artifact.save_to_disk(save_dir, verbose=True) + +# Reload later +from tilelang.autotuner.param import AutotuneResult, CompileArgs +restored = AutotuneResult.load_from_disk(save_dir, CompileArgs()) +best = restored.kernel +best(A, B, C) +``` + +Notes +- DLPack/Torch execution backend may not persist compiled binaries; in that + case, re‑compilation is needed on load or use a different backend. +- The directory contains human‑readable JSONs (best config/latency) and sources. 
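+
+If you prefer to pin the winning parameters in source instead of reloading the saved artifact, the best-config JSON in that directory can be fed back into a plain `@tilelang.jit` factory. The snippet below is a minimal sketch under stated assumptions: `best_config.json` is taken to be a flat mapping of the tunable arguments, and `build_matmul` is a hypothetical factory (the matmul above without `@tilelang.autotune`) exposing `block_M`, `block_N`, `block_K`, `num_stages`, and `threads` as keyword arguments.
+
+```python
+import json
+from pathlib import Path
+
+best_dir = Path('out/best/matmul_1024')   # directory passed to save_to_disk above
+best_config = json.loads((best_dir / 'best_config.json').read_text())
+
+# build_matmul: hypothetical plain @tilelang.jit factory taking the tuned
+# parameters as ordinary keyword arguments (no autotune decorator).
+pinned_kernel = build_matmul(M, N, K, **best_config)
+pinned_kernel(A, B, C)
+```
+
+Once the pinned values are stable, you can bake them into the factory's defaults and drop the tuning step from production runs.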
+ +## Advanced: Config Space Callables + +Derive config spaces from problem sizes to keep searches targeted and legal: + +```python +def matmul_configs(M, N, K): + large = min(M, N, K) >= 1024 + tiles = [128] if large else [64, 128] + for BM in tiles: + for BN in tiles: + for BK in [32, 64]: + for S in [2, 3]: + for TH in [128, 256]: + yield dict(block_M=BM, block_N=BN, block_K=BK, + num_stages=S, threads=TH) +``` + +## Device and Backend Selection + +Tune compile‑time options explicitly: +- `target='auto'|'cuda'|'hip'|'metal'` (normalized to a TVM Target) +- `execution_backend='auto'|'tvm_ffi'|'cython'|'nvrtc'|'torch'` +- `pass_configs={...}` to toggle TileLang/TVM passes for experiments + +On CUDA with multiple GPUs, the tuner sets the current device per worker thread +to avoid context mixups. + +## Troubleshooting +- “No configurations to tune”: Ensure `configs` is a non‑empty list or callable. +- Timeouts: Increase `timeout`; ensure inputs fit device memory; verify that + your reference check isn’t the bottleneck. +- Dynamic shapes: Provide concrete inputs via `set_autotune_inputs` or a custom + `supply_prog`. +- Disk cache disabled: Check `TILELANG_AUTO_TUNING_DISABLE_CACHE` and backend. diff --git a/docs/programming_guides/control_flow.md b/docs/programming_guides/control_flow.md new file mode 100644 index 000000000..158c51166 --- /dev/null +++ b/docs/programming_guides/control_flow.md @@ -0,0 +1,145 @@ +# Control Flow + +This guide covers the control‑flow primitives in TileLang and how they lower to +efficient GPU code. You will use these to structure loops, handle boundaries, +and express pipelined compute. + +## Overview +- Conditionals: `if` / `elif` / `else`, ternary (`x if c else y`) +- Loops: `T.serial`, `T.unroll`, `T.Parallel`, `T.Pipelined` +- While loops: `while` with a TIR condition +- Flow control: Python `break` / `continue` +- Safety: automatic OOB guards via the LegalizeSafeMemoryAccess pass + +The examples assume `import tilelang.language as T`. + +## Conditionals + +Standard Python `if`/`elif`/`else` is supported inside `@T.prim_func` kernels. +Conditions should be TIR expressions (e.g., `i < N`). Python plain booleans are +treated as compile‑time constants and will be folded. + +```python +for i in T.serial(N): + if i < N: # TIR condition + C[i] = A[i] + B[i] + else: + pass + +# Ternary +x = (A[i] if i < N else 0) +``` + +Short‑circuit boolean ops are supported. For multi‑dimensional bounds, use +`T.any_of` / `T.all_of` for clarity: + +```python +if T.all_of(i < M, j < N): + C[i, j] = A[i, j] + B[i, j] +``` + +Boundary handling note +- The LegalizeSafeMemoryAccess pass automatically inserts guards when an access + may be out‑of‑bounds, and elides them when proven safe. You can often omit + explicit `if` checks for simple edge handling, but keep them when you need + custom logic or clarity. + +## Loops + +### Serial + +`T.serial` creates a plain for‑loop. Common forms: + +```python +for i in T.serial(N): + ... # 0..N-1 + +for i in T.serial(0, N, 2): + ... # 0, 2, 4, ... +``` + +### Unroll + +`T.unroll` requests loop unrolling for small trip counts. + +```python +for k in T.unroll(K_TILE): + acc += a[k] * b[k] +``` + +Advanced: TileLang forwards unroll hints to TIR; factor/explicit knobs are +available for expert tuning. + +### Parallel (elementwise) + +`T.Parallel(ext0, ext1, ...)` builds nested loops that map well to elementwise +operations. 
The body receives all indices in one `for` header: + +```python +for i, j in T.Parallel(M, N): + C[i, j] = A[i, j] + B[i, j] +``` + +Optional: `coalesced_width=` can hint memory coalescing for the innermost loop. + +### Pipelined (software pipelining) + +`T.Pipelined(iters, num_stages=...)` overlaps producer/consumer stages (e.g., +Global→Shared copies with compute). This is the backbone of GEMM/attention +pipelines. + +```python +for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) # stage: copy A tile + T.copy(B[ko * BK, bx * BN], B_s) # stage: copy B tile + T.gemm(A_s, B_s, C_f) # stage: compute +``` + +### Persistent (advanced) + +`T.Persistent(domain, wave_size, index, group_size=...)` exposes persistent +thread‑block style looping. It is an advanced construct that TileLang lowers in +later passes and is typically used by specialized templates. + +## While Loops + +`while` is supported when the condition is a TIR expression. Avoid infinite +loops; TileLang will error if it detects a constant‑true condition. + +```python +i = 0 +while i < N: + ... + if done: + break + i += 1 +``` + +## Break and Continue + +Use Python `break`/`continue` to exit or skip within `T.serial`/`T.unroll`/ +`T.Parallel`/`while` loops. Keep the body clean after a `break`/`continue` for +readability; the compiler will ignore the dead path. + +## Putting It Together: Residual Tile Handling + +Below is a typical edge pattern for a 2D kernel. With LegalizeSafeMemoryAccess, +the explicit guard can be omitted when you don’t need a custom edge path. + +```python +for i, j in T.Parallel(M, N): + gi = by * BM + i + gj = bx * BN + j + if T.all_of(gi < M, gj < N): # optional in many cases + C[gi, gj] = A[gi, gj] + B[gi, gj] +``` + +## Debugging Conditions + +Use `T.print` to inspect values under predicates. For buffers, TileLang prints +from a single thread to avoid duplicate outputs. + +```python +if i == 0: + T.print(C, msg='C tile:') +``` diff --git a/docs/programming_guides/instructions.md b/docs/programming_guides/instructions.md new file mode 100644 index 000000000..69025c347 --- /dev/null +++ b/docs/programming_guides/instructions.md @@ -0,0 +1,180 @@ +# Instructions + +This page summarizes the core TileLang “instructions” available at the DSL +level, how they map to hardware concepts, and how to use them correctly. + +## Quick Categories +- Data movement: `T.copy`, `T.c2d_im2col`, staging Global ↔ Shared ↔ Fragment +- Compute primitives: `T.gemm`/`T.gemm_sp`, elementwise math (`T.exp`, `T.max`), + reductions (`T.reduce_sum`, `T.cumsum`, warp reducers) +- Control helpers: `T.clear`/`T.fill`, `T.reshape`/`T.view` +- Diagnostics: `T.print`, `T.device_assert` +- Advanced: atomics, memory barriers, warp‑group ops + +## Data Movement + +Use `T.copy(src, dst, coalesced_width=None, disable_tma=False, eviction_policy=None)` +to move tiles between memory scopes. It accepts `tir.Buffer`, `BufferLoad`, or +`BufferRegion`; extents are inferred or broadcast when possible. + +```python +# Global → Shared tiles (extents inferred from dst) +T.copy(A[by * BM, ko * BK], A_s) +T.copy(B[ko * BK, bx * BN], B_s) + +# Fragment/Register → Global (store result) +T.copy(C_f, C[by * BM, bx * BN]) +``` + +Semantics +- Extents are deduced from arguments; missing sides broadcast to the other’s rank. +- Access patterns are legalized and coalesced during lowering. Explicit + vectorization is not required in HL mode. 
+- Safety: the LegalizeSafeMemoryAccess pass inserts boundary guards when an + access may be out‑of‑bounds and drops them when proven safe. + +Other helpers +- `T.c2d_im2col(img, col, ...)`: convenience for conv‑style transforms. + +## Compute Primitives + +GEMM and sparse GEMM +- `T.gemm(A_shared, B_shared, C_fragment)`: computes a tile GEMM using shared + inputs and a fragment accumulator; lowered to target‑specific tensor cores. +- `T.gemm_sp(...)`: 2:4 sparse tensor core variant (see examples and README). + +Reductions and scans +- `T.reduce_sum`, `T.reduce_max`, `T.reduce_min`, `T.cumsum`, plus warp + reducers (`T.warp_reduce_sum`, etc.). +- Allocate and initialize accumulators via `T.alloc_fragment` + `T.clear` or + `T.fill`. + +Elementwise math +- Most math ops mirror TVM TIR: `T.exp`, `T.log`, `T.max`, `T.min`, `T.rsqrt`, + `T.sigmoid`, etc. Compose freely inside loops. + +Reshape/view (no copy) +- `T.reshape(buf, new_shape)` and `T.view(buf, shape=None, dtype=None)` create + new views that share storage, with shape/dtype checks enforced. + +## Synchronization (HL usage) + +In HL pipelines, you usually don’t need to write explicit barriers. Passes such +as PipelinePlanning/InjectSoftwarePipeline/InjectTmaBarrier orchestrate +producer/consumer ordering and thread synchronization behind the scenes. + +If you need debugging or explicit checks: +- `T.device_assert(cond, msg='')` emits device‑side asserts on CUDA targets. +- `T.print(obj, msg='...')` prints scalars or buffers safely from one thread. + +## Putting It Together: GEMM Tile + +```python +@T.prim_func +def gemm( + A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16'), +): + with T.Kernel(T.ceildiv(N, BN), T.ceildiv(M, BM), threads=128) as (bx, by): + A_s = T.alloc_shared((BM, BK), 'float16') + B_s = T.alloc_shared((BK, BN), 'float16') + C_f = T.alloc_fragment((BM, BN), 'float32') + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) # Global → Shared + T.copy(B[ko * BK, bx * BN], B_s) + T.gemm(A_s, B_s, C_f) # compute into fragment + + T.copy(C_f, C[by * BM, bx * BN]) # store back +``` + +## Instruction Reference (Concise) + +Below is a concise list of TileLang instructions grouped by category. For full +signatures, behaviors, constraints, and examples, refer to API Reference +(`autoapi/tilelang/index`). + +Data movement +- `T.copy(src, dst, ...)`: Move tiles between Global/Shared/Fragment. +- `T.c2d_im2col(img, col, ...)`: 2D im2col transform for conv. + +Memory allocation and descriptors +- `T.alloc_shared(shape, dtype, scope='shared.dyn')`: Allocate shared buffer. +- `T.alloc_fragment(shape, dtype, scope='local.fragment')`: Allocate fragment. +- `T.alloc_var(dtype, [init], scope='local.var')`: Scalar var buffer (1 elem). +- `T.alloc_barrier(arrive_count)`: Shared barrier buffer. +- `T.alloc_tmem(shape, dtype)`: Tensor memory (TMEM) buffer (Hopper+). +- `T.alloc_reducer(shape, dtype, op='sum', replication=None)`: Reducer buf. +- `T.alloc_descriptor(kind, dtype)`: Generic descriptor allocator. + - `T.alloc_wgmma_desc(dtype='uint64')` + - `T.alloc_tcgen05_smem_desc(dtype='uint64')` + - `T.alloc_tcgen05_instr_desc(dtype='uint32')` +- `T.empty(shape, dtype='float32')`: Declare function output tensors. + +Compute primitives +- `T.gemm(A_s, B_s, C_f)`: Tile GEMM into fragment accumulator. +- `T.gemm_sp(...)`: Sparse (2:4) tensor core GEMM. +- Reductions: `T.reduce_sum/max/min/abssum/absmax`, bitwise `and/or/xor`. 
+- Scans: `T.cumsum`, finalize: `T.finalize_reducer`. +- Warp reducers: `T.warp_reduce_sum/max/min/bitand/bitor`. +- Elementwise math: TIR ops (`T.exp`, `T.log`, `T.max`, `T.min`, `T.rsqrt`, ...). +- Fast math: `T.__log/__log2/__log10/__exp/__exp2/__exp10/__sin/__cos/__tan`. +- IEEE math: `T.ieee_add/sub/mul/fmaf` (configurable rounding). +- Helpers: `T.clear(buf)`, `T.fill(buf, value)`. +- Views: `T.reshape(buf, shape)`, `T.view(buf, shape=None, dtype=None)`. + +Diagnostics +- `T.print(obj, msg='')`: Print scalar/buffer from one thread. +- `T.device_assert(cond, msg='')`: Device-side assert (CUDA). + +Logical helpers +- `T.any_of(a, b, ...)`, `T.all_of(a, b, ...)`: Multi-term predicates. + +Annotation helpers +- `T.use_swizzle(panel_size=..., enable=True)`: Rasterization hint. +- `T.annotate_layout({...})`: Attach explicit layouts to buffers. +- `T.annotate_safe_value(var, ...)`: Safety/const hints. +- `T.annotate_l2_hit_ratio(buf, ratio)`: Cache behavior hint. + +Atomics +- `T.atomic_add(dst, value, memory_order=None, return_prev=False, use_tma=False)`. +- `T.atomic_addx2(dst, value, return_prev=False)`; `T.atomic_addx4(...)`. +- `T.atomic_max(dst, value, memory_order=None, return_prev=False)`. +- `T.atomic_min(dst, value, memory_order=None, return_prev=False)`. +- `T.atomic_load(dst)`, `T.atomic_store(dst, value)`. + +Custom intrinsics +- `T.dp4a(A, B, C)`: 4‑element dot‑product accumulate. +- `T.clamp(x, lo, hi)`: Clamp to [lo, hi]. +- `T.loop_break()`: Break from current loop via intrinsic. + +Barriers, TMA, warp‑group +- Barriers: `T.create_list_of_mbarrier(...)`, `T.get_mbarrier(i)`. +- Parity ops: `T.mbarrier_wait_parity(barrier, parity)`, `T.mbarrier_arrive(barrier)`. +- Expect tx: `T.mbarrier_expect_tx(...)`; sugar: `T.barrier_wait(id, parity=None)`. +- TMA: `T.create_tma_descriptor(...)`, `T.tma_load(...)`, + `T.tma_store_arrive(...)`, `T.tma_store_wait(...)`. +- Proxy/fences: `T.fence_proxy_async(...)`, `T.warpgroup_fence_operand(...)`. +- Warp‑group: `T.warpgroup_arrive()`, `T.warpgroup_commit_batch()`, + `T.warpgroup_wait(num_mma)`, `T.wait_wgmma(id)`. + +Lane/warp index +- `T.get_lane_idx(warp_size=None)`: Lane id in warp. +- `T.get_warp_idx_sync(warp_size=None)`: Canonical warp id (sync). +- `T.get_warp_idx(warp_size=None)`: Canonical warp id (no sync). +- `T.get_warp_group_idx(warp_size=None, warps_per_group=None)`: Group id. + +Register control +- `T.set_max_nreg(reg_count, is_inc)`, `T.inc_max_nreg(n)`, `T.dec_max_nreg(n)`. +- `T.annotate_producer_reg_dealloc(n=24)`, `T.annotate_consumer_reg_alloc(n=240)`. +- `T.no_set_max_nreg()`, `T.disable_warp_group_reg_alloc()`. + +## Notes on Dtypes + +Dtypes accept three equivalent forms: +- String: `'float32'` +- TileLang dtype: `T.float32` +- Framework dtype: `torch.float32` +All are normalized internally. See Type System for details. diff --git a/docs/programming_guides/language_basics.md b/docs/programming_guides/language_basics.md new file mode 100644 index 000000000..1152680c9 --- /dev/null +++ b/docs/programming_guides/language_basics.md @@ -0,0 +1,234 @@ +# Language Basics + +This page introduces the core TileLang (tile‑lang) DSL that you’ll use to write +high‑performance kernels. It focuses on how to define a kernel, express +iteration, move data across memory scopes, and run it with JIT. + +The examples use the conventional aliases: + +```python +import tilelang +import tilelang.language as T +from tilelang import jit +``` + +## 1. 
Defining a Kernel with `@T.prim_func` + +TileLang kernels are TIR (TVM IR) functions produced by the `@T.prim_func` +decorator. Arguments are annotated with shapes and dtypes via `T.Tensor` or +`T.Buffer`. + +Note on dtypes +- You can pass dtypes as a string (e.g., 'float32'), a TileLang dtype (e.g., `T.float32`), + or a framework dtype (e.g., `torch.float32`). TileLang normalizes all of these. + See Type System for details. + +```python +@T.prim_func +def add_kernel( + A: T.Tensor((N,), dtype), # dtype could be 'float32' | T.float32 | torch.float32 + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), +): + ... # kernel body +``` + +- Shapes may be concrete integers or symbolic. For symbolic, you can pass + Python ints through the outer `@jit` wrapper (shown below), or annotate with + `T.dyn` when you want a named symbolic dimension. + +```python +# Named symbolic dimension (optional) +K = T.dyn['K'] +@T.prim_func +def uses_dyn(A: T.Tensor((K,), 'float32')): + ... +``` + +### Dynamic symbolic dimensions: two ways + +TileLang supports two complementary ways to introduce symbolic (dynamic) dims: + +- Type-level annotations via `T.dyn[...]` (recommended for function signatures) + - Use in `T.Tensor((T.dyn['K'], ...), dtype)` or bind once then reuse (as above). + - Inside the kernel body, prefer reading from the buffer’s shape, e.g. `M = A.shape[0]`. + +- Term-level variables via `T.dynamic(name, dtype)` + - Creates a TIR `tir.Var` you can use directly in expressions/loops. + - Handy when you need to reference the dimension symbol in the body. + +```python +# 1) Annotation-only symbol; read the bound size via shape +K = T.dyn['K'] # dtype defaults to int32 +@T.prim_func +def foo(A: T.Tensor((K,), 'float32')): + N = A.shape[0] + for i in T.serial(N): + ... + +# 2) Explicit Var symbol usable in the body +K = T.dynamic('K', 'int32') # or T.dynamic('K') defaults to int32 +@T.prim_func +def bar(A: T.Tensor((K,), 'float32')): + for i in T.serial(K): + ... +``` + +Notes +- `T.symbolic(name, dtype)` is a deprecated alias of `T.dynamic`; prefer `T.dynamic`. +- Under `@jit`, concrete sizes come from the actual tensor arguments at the first call. +- Symbols in annotations do not need to be separate kernel arguments; TileLang binds them from argument shapes. + +## 2. Launching Work with `T.Kernel` + +`with T.Kernel(...)` declares a launch context and creates block/thread +bindings. For GPU backends, specify a grid and threads per block. + +```python +with T.Kernel(grid_x, grid_y, threads=128) as (bx, by): + ... # bx/by are blockIdx.x/y +``` + +You rarely need raw thread indices; most kernels use structured loops +(`T.serial`, `T.unroll`, `T.Parallel`, `T.Pipelined`) inside a `T.Kernel`. + +## 3. Loops and Control Flow + +Core loop constructs map to familiar hardware patterns: + +- `T.serial(start, stop[, step])`: plain for‑loop +- `T.unroll(start, stop[, step])`: unrolled loop +- `T.Parallel(ext0, ext1, ...)`: nested parallel loops (elementwise‑friendly) +- `T.Pipelined(iters, num_stages=N)`: software pipelining for producer/consumer + +```python +for i in T.serial(N): + ... + +for i, j in T.Parallel(M, N): + C[i, j] = A[i, j] + B[i, j] + +for k in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + # overlap copy/compute across stages + ... +``` + +Conditionals use standard Python `if`/`else`. Guard edges with predicates when +tile sizes do not divide problem sizes evenly. + +## 4. 
Memory Scopes and Allocation + +TileLang exposes key software‑managed scopes: + +- Global: device memory (default for `T.Tensor` arguments) +- Shared: on‑chip, block‑visible (`T.alloc_shared(shape, dtype)`) +- Fragment and scalars: per‑thread fragments and scalar vars but in Shared View + (`T.alloc_fragment`, `T.alloc_var`) + +```python +A_shared = T.alloc_shared((BM, BK), 'float16') +B_shared = T.alloc_shared((BK, BN), 'float16') +C_local = T.alloc_fragment((BM, BN), 'float32') +T.clear(C_local) # zero accumulators +``` + +## 5. Moving Data: `T.copy` + +Use `T.copy(src, dst)` to move tiles between scopes. It accepts buffers, +buffer regions, or buffer loads; extents are inferred or can be broadcast. + +```python +# Global -> Shared (tile copy), extents inferred from dst +T.copy(A[by * BM, ko * BK], A_shared) +T.copy(B[ko * BK, bx * BN], B_shared) + +# Fragment -> Global (store back) +T.copy(C_local, C[by * BM, bx * BN]) +``` + +`T.copy` performs coalescing and scope‑specific lowering during compilation. + +## 6. A Minimal End‑to‑End Example (Vector Add) + +```python +import tilelang +import tilelang.language as T +from tilelang import jit + +@jit # infers target from tensors at first call +def add(N: int, block: int = 256, dtype: str = 'float32'): + + @T.prim_func + def add_kernel( + A: T.Tensor((N,), dtype), + B: T.Tensor((N,), dtype), + C: T.Tensor((N,), dtype), + ): + with T.Kernel(T.ceildiv(N, block), threads=block) as bx: + for i in T.Parallel(block): + gi = bx * block + i + # Optional — LegalizeSafeMemoryAccess inserts a guard when an access may be OOB + C[gi] = A[gi] + B[gi] + + return add_kernel + +# Host side (PyTorch shown; NumPy/DLPack also supported) +import torch +N = 1 << 20 +A = torch.randn(N, device='cuda', dtype=torch.float32) +B = torch.randn(N, device='cuda', dtype=torch.float32) +C = torch.empty(N, device='cuda', dtype=torch.float32) + +kernel = add(N) +kernel(A, B, C) # runs on GPU +torch.testing.assert_close(C, A + B) +``` + +Notes +- The `@jit` wrapper returns a callable kernel after the first compilation. +- You can pass compile‑time tunables (tile sizes, dtypes) through the outer + Python function and bake them into the generated TIR. + +## 7. Tiled GEMM Skeleton + +Below is a minimal pattern for a tiled GEMM using shared memory staging and a +fragment accumulator. It mirrors the quickstart style found in the repository. + +```python +@T.prim_func +def gemm( + A: T.Tensor((M, K), 'float16'), + B: T.Tensor((K, N), 'float16'), + C: T.Tensor((M, N), 'float16'), +): + with T.Kernel(T.ceildiv(N, BN), T.ceildiv(M, BM), threads=128) as (bx, by): + A_s = T.alloc_shared((BM, BK), 'float16') + B_s = T.alloc_shared((BK, BN), 'float16') + C_f = T.alloc_fragment((BM, BN), 'float32') + T.clear(C_f) + + for ko in T.Pipelined(T.ceildiv(K, BK), num_stages=3): + T.copy(A[by * BM, ko * BK], A_s) + T.copy(B[ko * BK, bx * BN], B_s) + T.gemm(A_s, B_s, C_f) # lowered to tensor‑core/ISA specific kernels + + T.copy(C_f, C[by * BM, bx * BN]) +``` + +## 8. Debugging and Printing + +Use `T.print` inside a kernel for quick introspection. TileLang emits printing +from a single thread for shared/fragment scopes to avoid floods. + +```python +T.print(C_f, msg='accumulator:') +T.print(A_s, msg='A tile:') +T.print(C[0], msg='C[0] = ') +``` + +## 9. 
Where to Go Next + +- Control flow details: see Programming Guides → Control Flow +- Memory topics: see Programming Guides → (removed cache/layout); basics are covered inline +- Autotuning tile sizes and mappings: Programming Guides → Autotuning +- Operator examples (GEMM, GEMV, attention): see Deep Learning Operators diff --git a/docs/programming_guides/overview.md b/docs/programming_guides/overview.md new file mode 100644 index 000000000..64b6d2039 --- /dev/null +++ b/docs/programming_guides/overview.md @@ -0,0 +1,27 @@ +# Programming Guides Overview + +This section provides a practical guide to writing high‑performance kernels with Tile Language (tile‑lang). +It mirrors the structure of a similar guide in another project and adapts it to tile‑lang concepts and APIs. + +- Audience: Developers implementing custom GPU/CPU kernels with tile‑lang +- Prereqs: Basic Python, NumPy/Tensor concepts, and familiarity with GPU programming notions +- Scope: Language basics, control flow, instructions, autotuning, and type system + +## What You’ll Learn +- How to structure kernels with TileLang’s core DSL constructs +- How to move data across global/shared/fragment and pipeline compute +- How to apply autotuning to tile sizes and schedules +- How to specify and work with dtypes in kernels + +## Suggested Reading Order +1. Language Basics +2. Control Flow +3. Instructions +4. Autotuning +5. Type System + +## Related Docs +- Tutorials: see existing guides in `tutorials/` +- Operators: examples in `deeplearning_operators/` + +> NOTE: This is a draft scaffold. Fill in code snippets and benchmarks as APIs evolve. diff --git a/docs/programming_guides/type_system.md b/docs/programming_guides/type_system.md new file mode 100644 index 000000000..60061df3f --- /dev/null +++ b/docs/programming_guides/type_system.md @@ -0,0 +1,41 @@ +# Type System + +This page lists the data types supported by TileLang and how to specify them in +kernels. For full details and the authoritative list, see the API Reference +(`autoapi/tilelang/index`) and `tilelang.language.v2.dtypes`. + +How to specify dtypes +- Use any of the following forms; TileLang normalizes them internally: + - String: `'float32'`, `'int8'`, `'bfloat16'`, ... + - TileLang dtype object: `T.float32`, `T.int8`, `T.bfloat16`, ... + - Framework dtype: `torch.float32`, `torch.int8`, `torch.bfloat16`, ... + +Common scalar types +- Boolean: `bool` +- Signed integers: `int8`, `int16`, `int32`, `int64` +- Unsigned integers: `uint8`, `uint16`, `uint32`, `uint64` +- Floating‑point: `float16` (half), `bfloat16`, `float32`, `float64` + +Float8 and low‑precision families +- Float8: `float8_e3m4`, `float8_e4m3`, `float8_e4m3b11fnuz`, `float8_e4m3fn`, + `float8_e4m3fnuz`, `float8_e5m2`, `float8_e5m2fnuz`, `float8_e8m0fnu` +- Float6: `float6_e2m3fn`, `float6_e3m2fn` +- Float4: `float4_e2m1fn` + +Vectorized element types (SIMD packs) +- For many base types, vector‑packed variants are available by lane count: + `x2`, `x4`, `x8`, `x16`, `x32`, `x64`. +- Examples: + - Integers: `int8x2`, `int8x4`, ..., `int32x2`, `int32x4`, ... + - Unsigned: `uint8x2`, `uint8x4`, ... + - Floats: `float16x2`, `float16x4`, `float32x2`, `float32x4`, ... + - Float8/6/4 families also provide `x2/x4/x8/x16/x32/x64` where applicable, + e.g., `float8_e4m3x2`, `float8_e4m3x4`, `float6_e2m3fnx8`, `float4_e2m1fnx16`. + +Notes +- Availability of certain low‑precision formats (float8/6/4) depends on target + architecture and backend support. 
+- Choose accumulation dtypes explicitly for mixed‑precision compute (e.g., + GEMM with `float16` inputs and `float32` accumulators). +- The complete, up‑to‑date list is exposed in + `tilelang.language.v2.dtypes` and rendered in the API Reference. diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index e859d0e7b..6fd433459 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -1,4 +1,5 @@ cancelled +HDA hsa ist LOD diff --git a/docs/tutorials/auto_tuning.md b/docs/tutorials/auto_tuning.md index 3f3cad832..33368a2f0 100644 --- a/docs/tutorials/auto_tuning.md +++ b/docs/tutorials/auto_tuning.md @@ -14,7 +14,7 @@ Auto-tuning a Tile Language program involves three main steps: ## Matrix Multiplication Example -The following example demonstrates auto-tuning matrix multiplication. Code has been simplified for readability - see `examples/gemm/example_gemm.py` for complete implementation. +The following example demonstrates auto-tuning matrix multiplication. Code has been simplified for readability - see `examples/gemm/example_gemm.py` for complete implementation. ### Step 1: Implement with Reserved Parameters Users can implement matrix multiplication in Tile Language while reserving parameters for optimization: @@ -145,4 +145,4 @@ for hint in roller_hints: config["thread_num"] = block_rows * block_cols * 32 config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization -``` \ No newline at end of file +``` diff --git a/docs/tutorials/debug_tools_for_tilelang.md b/docs/tutorials/debug_tools_for_tilelang.md index e18b13279..d98d4cb5e 100644 --- a/docs/tutorials/debug_tools_for_tilelang.md +++ b/docs/tutorials/debug_tools_for_tilelang.md @@ -12,7 +12,6 @@ A Tile Language program (hereafter referred to as a *program*) is transformed in 2. The program undergoes multiple *Passes* for transformation and optimization (the *lower* stage, see `tilelang/engine/lower.py`), finally producing an intermediate representation (e.g., LLVM or C for CPU, CUDA for NVIDIA GPUs, etc.). 3. The generated code is compiled by the respective compiler (e.g., nvcc) into a hardware-executable file. - ```{figure} ../_static/img/overview.png :width: 300 :alt: Overview of the compilation process @@ -22,9 +21,9 @@ A Tile Language program (hereafter referred to as a *program*) is transformed in During this process, users may encounter roughly three categories of issues: -* **Generation issues**: The Tile Language program fails to generate a valid hardware-executable file (i.e., errors during the lowering process). -* **Correctness issues**: The resulting executable runs, but produces incorrect results. -* **Performance issues**: The executable runs with performance significantly below the expected theoretical hardware limits. +- **Generation issues**: The Tile Language program fails to generate a valid hardware-executable file (i.e., errors during the lowering process). +- **Correctness issues**: The resulting executable runs, but produces incorrect results. +- **Performance issues**: The executable runs with performance significantly below the expected theoretical hardware limits. This tutorial focuses on the first two issues—how to debug generation and correctness problems. Performance tuning often requires using vendor-provided profiling tools (e.g., **Nsight Compute**, **rocProf**, etc.) for further hardware-level analysis, which we will address in future materials. 
@@ -52,7 +51,6 @@ func = matmul(1024, 1024, 1024, 128, 128, 32)
 
 TileLang essentially performs *progressive lowering*. For example, a `T.copy` may first be expanded into `T.Parallel` (see the pass `LowerTileOP`), which is then expanded again, eventually resulting in lower-level statements that can be translated to CUDA C code.
 
-
 ```{figure} ../_static/img/ir_transform_diagram.png
 :width: 400
 :alt: IR transformation diagram
@@ -171,6 +169,31 @@ The output messages will include something like:
 msg='hello world' BlockIdx=(0, 0, 0), ThreadIdx=(0, 0, 0): 0
 ```
 
+### Visual Layout Inference for TileLang
+The **Visual Layout Inference** tool automatically generates visual diagrams that illustrate the mapping between logical indices, thread IDs, and register file locations.
+
+When TileLang performs layout inference, it determines how fragment buffers are distributed across threads. The visual layout tool captures this information and generates:
+1. **Textual output**: A human-readable description of the layout mapping
+2. **Visual diagrams**: Color-coded plots showing the thread-to-data mapping
+
+The visual layout inference tool is controlled through the `TL_LAYOUT_VISUALIZATION_ENABLE` and `TL_LAYOUT_VISUALIZATION_FORMATS` pass configurations. By default, `TL_LAYOUT_VISUALIZATION_ENABLE` is **disabled** to avoid performance overhead during compilation.
+
+When enabled, `TL_LAYOUT_VISUALIZATION_FORMATS` accepts string values to control the output formats:
+- "txt": Text output only (the default)
+- "all": Generate all formats (TXT, PDF, PNG, SVG)
+- "png": Generate PNG format only
+- "pdf": Generate PDF format only
+- "svg": Generate SVG format only
+- "txt,svg": Generate multiple formats (comma-separated) in addition to text output
+
+The "txt" output will include something like:
+```
+C_local inferenced layout:
+  Shape: [32, 32] -> [8]
+  Thread: _j // 16 * 64 + _i // 16 * 32 + _i % 8 * 4 + _j % 8 // 2
+  Index: [_j % 16 // 8 * 4 + _i % 16 // 8 * 2 + _j % 2]
+```
+
 ## Conclusion
 
 By carefully examining intermediate representations (IR) before final code generation—and by leveraging runtime printing through `T.print`—one can quickly diagnose where index calculations, copy logic, or other kernel operations deviate from the intended behavior. This two-pronged approach (inspecting IR transformations and using runtime prints) is often sufficient for resolving generation and correctness issues in TileLang programs.
diff --git a/docs/tutorials/logging.md b/docs/tutorials/logging.md
new file mode 100644
index 000000000..1a015432d
--- /dev/null
+++ b/docs/tutorials/logging.md
@@ -0,0 +1,116 @@
+Logging in Tilelang/TVM
+===================================================
+
+Author: SiriusNEO +
+ +## TVM Logging Overview + +Tilelang currently utilizes the logging system from TVM. The implementation can be found in: + +- [include/tvm/runtime/logging.h](https://github.com/apache/tvm/blob/main/include/tvm/runtime/logging.h): Macro definitions +- [src/runtime/logging.cc](https://github.com/apache/tvm/blob/main/src/runtime/logging.cc): Logging logic implementation + +The design style is inspired by [Google's glog](https://google.github.io/glog/stable/). + +## Logging Categories + +There are three primary macro types: + +```c++ +LOG(INFO) << "aaa"; +DLOG(INFO) << "aaa"; +VLOG(1) << "aaa"; +``` + +- **LOG**: Standard logging preserved in code for displaying necessary information at different levels during runtime. Most Tilelang C++ error reporting is implemented via `LOG(FATAL) << "error msg"`. +- **DLOG**: Debug logging for developer debugging output. DLOG is controlled at build time by the TVM_LOG_DEBUG environment variable and is **eliminated in Release builds through dead code elimination**. + - The key difference between LOG(DEBUG) and DLOG is this build-time elimination. We recommend using DLOG over LOG(DEBUG), as the latter has overlapping functionality and gets compiled into the release runtime. +- **VLOG**: [Verbose logging](https://google.github.io/glog/stable/logging/#verbose-logging), primarily for debugging. Its main feature is customizable verbosity levels. For example, VLOG(n) where n can be 1, 2, 3, 4, 5, or 6, enabling complex tracing requirements. In contrast, LOG and DLOG typically use predefined verbose levels like INFO and DEBUG. + - In practical Tilelang development, VLOG is used less frequently. + - TVM's VLOG is implemented using DLOG, thus inheriting DLOG's characteristics. + +Additional useful macros include various **CHECK** variants: + +```c++ +CHECK(cond) << "error msg"; +DCHECK(cond) << "error msg"; +ICHECK(cond) << "error msg"; +``` + +The implementation routes errors to LogFatal: + +```c++ +#define CHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: (" #x << ") is false: " +``` +- **DCHECK**: Debug mode CHECK, only compiled in debug builds +- **ICHECK**: Internal Check that should exist in Release builds. When ICHECK fails, the entire system should report an error. + +## Logging Verbose Levels + +TVM defines 5 levels for LOG and DLOG (adding DEBUG compared to glog): + +```c++ +#define TVM_LOG_LEVEL_DEBUG 0 +#define TVM_LOG_LEVEL_INFO 1 +#define TVM_LOG_LEVEL_WARNING 2 +#define TVM_LOG_LEVEL_ERROR 3 +#define TVM_LOG_LEVEL_FATAL 4 +``` + +## Using Logging in TileLang Development + +### Guidelines + +For temporary debugging output in your code, there are no restrictions (you can even use std::cout). Just remember to remove it before submitting a PR. + +For meaningful logging that should remain in the Tilelang codebase: + +- Critical correctness checks: Use ICHECK with sufficient error messages to facilitate debugging when issues arise. +- Complex Pass debugging: For passes requiring intermediate output that may need future review (e.g., LayoutInference), use DLOG. +- General INFO/WARNING messages: Use standard LOG. + +### Enabling Log Output in Tilelang + +To specify current log level at runtime, we need to set the environment variable `TVM_LOG_LEVEL`. An example usage is: + +```c++ +TVM_LOG_DEBUG=1 python3 code.py +``` + +which enables all DEBUG/INFO (level <= 1) logs for all files. + +#### Detailed Rules for TVM_LOG_DEBUG Specification + +The parsing logic is in `logging.cc`. 
Reference: [HyperAI Zhihu Article](https://zhuanlan.zhihu.com/p/1933106843468665163). + +Launch Python with `TVM_LOG_DEBUG=`, where `` is a comma-separated list of level assignments in the form `=`. Important notes: + +- The special filename DEFAULT sets the LOG level for all files. +- `` can be set to -1 to disable LOG for that file. +- `` is the C++ source filename (e.g., .cc, not .h) relative to the `src/` directory in the TVM repository. The `src/` prefix is optional when specifying file paths. + +### Enabling Debug Mode + +To enable DLOG/DCHECK, developers need to first build Tilelang in Debug mode: + +```bash +cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_CUDA=ON +``` + +Tilelang's CMake logic automatically adds the `TVM_LOG_DEBUG` macro, compiling all DLOG statements: + +```cmake +target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") +``` + +Then you also need to specify the runtime environment variables. For example, to use `DLOG(INFO) << "xxx"` for debugging, run your code with INFO level (1): `TVM_LOG_DEBUG=1`. + +:::{note} + **Important**: There are two TVM_LOG_DEBUG variables. (1) Compile-time macro: Determines whether debug content (like DLOG) is compiled into the .so file. Referenced in C++ source via #ifdef TVM_LOG_DEBUG. This is automatically enabled when using Debug build mode in CMake. (2) Runtime environment variable: Controls logging level at runtime. TVM provides a specification for this variable, allowing control over per-file logging levels. + + These two should ideally have different names, but TVM uses the same name for both, which can cause confusion. +::: diff --git a/examples/amd/example_amd_flash_attn_bwd.py b/examples/amd/example_amd_flash_attn_bwd.py index d47866e1e..788aec367 100644 --- a/examples/amd/example_amd_flash_attn_bwd.py +++ b/examples/amd/example_amd_flash_attn_bwd.py @@ -2,7 +2,7 @@ import torch.nn.functional as F import tilelang import tilelang.language as T -from tilelang.primitives.gemm.base import GemmWarpPolicy +from tilelang.tileop.base import GemmWarpPolicy import itertools import argparse from functools import partial @@ -11,22 +11,20 @@ def ref_program(Q, K, V, is_causal, groups=1): - assert Q.size( - 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" - assert Q.size( - 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" dim = Q.size(-1) K_ref = K.repeat_interleave(groups, dim=2) V_ref = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K_ref) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K_ref) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V_ref) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V_ref) lse = torch.logsumexp(scores, dim=-1).float() return output, lse @@ -45,23 +43,23 @@ def get_fwd_configs(): valid_configs = [] - for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product(block_M, block_N, num_split_q, - threads, num_stages, - 
enable_rasterization, k_pack, - panel_size, qk_coalesced_width, - v_coalesced_width): - valid_configs.append({ - "block_M": m, - "block_N": n, - "num_split_q": s, - "threads": t, - "num_stages": stages, - "enable_rasterization": r, - "k_pack": k, - "panel_size": p, - "qk_coalesced_width": qkw, - "v_coalesced_width": vw, - }) + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( + block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width + ): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, + } + ) return valid_configs @@ -85,23 +83,23 @@ def fast_flashattn( qk_coalesced_width: int, v_coalesced_width: int, ): - scale = (1.0 / dim)**0.5 + scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 vec_size = qk_coalesced_width v_vec_size = v_coalesced_width @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - LSE: T.Tensor([batch, heads, seq_len], accum_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + LSE: T.Tensor([batch, heads, seq_len], accum_dtype), ): with T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -111,7 +109,7 @@ def main( num_q_blocks = T.ceildiv(seq_len, block_M) - bx_loop_var = T.alloc_var("int32") + bx_loop_var = T.alloc_var(T.int32) bx_loop_var = b_split with T.While(bx_loop_var < num_q_blocks): @@ -135,33 +133,21 @@ def main( m_prev = T.alloc_fragment([block_M], accum_dtype) scale_factor = T.alloc_fragment([block_M], accum_dtype) - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) + T.copy(Q[bz, q_block_offset : q_block_offset + block_M, by, :], Q_shared, coalesced_width=vec_size) - loop_end_k = ( - T.ceildiv(q_block_offset + - block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) row_sum = T.alloc_fragment([block_M], accum_dtype) for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, - coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) + T.copy(K[bz, kv_idx : kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy(V[bz, kv_idx : kv_idx + block_N, by // groups, :], V_shared, coalesced_width=v_vec_size) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm( @@ -178,6 +164,8 @@ def main( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): if m_prev[i] == 
-T.infinity(accum_dtype): @@ -214,8 +202,7 @@ def main( for i in T.Parallel(block_M): if q_block_offset + i < seq_len: - lse_val = T.if_then_else(l_i[i] > 0, - T.log(l_i[i]) + m_i[i], -T.infinity(accum_dtype)) + lse_val = T.if_then_else(l_i[i] > 0, T.log(l_i[i]) + m_i[i], -T.infinity(accum_dtype)) LSE[bz, by, q_block_offset + i] = lse_val bx_loop_var = current_bx + num_split_q @@ -232,30 +219,30 @@ def get_bwd_configs(): panel_size = [7, 8, 9, 10] configs = [] - for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, - enable_rasterization, panel_size): - configs.append({ - "block_M": m, - "block_N": n, - "num_stages": stages, - "threads": t, - "enable_rasterization": r, - "panel_size": p, - }) + for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, enable_rasterization, panel_size): + configs.append( + { + "block_M": m, + "block_N": n, + "num_stages": stages, + "threads": t, + "enable_rasterization": r, + "panel_size": p, + } + ) return configs @tilelang.jit(out_idx=[2]) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func - def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), - Delta: T.Tensor([batch, heads, seq_len], accum_dtype)): + def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), Delta: T.Tensor([batch, heads, seq_len], accum_dtype)): with T.Kernel(batch, heads, T.ceildiv(seq_len, blk)) as (bz, bx, by): o = T.alloc_fragment([blk, blk], dtype) do = T.alloc_fragment([blk, blk], dtype) @@ -263,36 +250,51 @@ def flash_bwd_prep(O: T.Tensor(shape, dtype), dO: T.Tensor(shape, dtype), delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep @tilelang.autotune(configs=get_bwd_configs(), cache_input_tensors=True) @tilelang.jit -def flashattn_bwd(batch, heads, seq_len, dim, is_causal, groups, block_M: int, block_N: int, - num_stages: int, threads: int, enable_rasterization: bool, panel_size: int): - sm_scale = (1.0 / dim)**0.5 +def flashattn_bwd( + batch, + heads, + seq_len, + dim, + is_causal, + groups, + block_M: int, + block_N: int, + num_stages: int, + threads: int, + enable_rasterization: bool, + panel_size: int, +): + sm_scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func - def flash_bwd_kernel(Q: T.Tensor(q_shape, - dtype), K: T.Tensor(kv_shape, - dtype), V: T.Tensor(kv_shape, dtype), - dO: T.Tensor(q_shape, dtype), lse: T.Tensor([batch, heads, seq_len], - accum_dtype), - Delta: T.Tensor([batch, heads, seq_len], - accum_dtype), dQ: T.Tensor(q_shape, accum_dtype), - dK: T.Tensor(kv_shape, accum_dtype), dV: T.Tensor(kv_shape, accum_dtype)): + def flash_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + K: 
T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + dO: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + dQ: T.Tensor(q_shape, accum_dtype), + dK: T.Tensor(kv_shape, accum_dtype), + dV: T.Tensor(kv_shape, accum_dtype), + ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -313,8 +315,8 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, dk = T.alloc_fragment([block_M, dim], accum_dtype) dq = T.alloc_fragment([block_N, dim], accum_dtype) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) @@ -322,22 +324,21 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q_shared) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q_shared) T.clear(qkT) T.gemm(K_shared, q_shared, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): P_acc[i, j] = T.exp(qkT[i, j] * sm_scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - P_acc[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, - P_acc[i, j], 0.0) + P_acc[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, P_acc[i, j], 0.0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do_shared) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do_shared) T.clear(dP) T.gemm(V_shared, do_shared, dP, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -345,7 +346,7 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, T.copy(P_acc, p_cast) T.gemm(p_cast, do_shared, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta_shared) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta_shared) for i, j in T.Parallel(block_M, block_N): p_cast[i, j] = P_acc[i, j] * (dP[i, j] - delta_shared[j]) * sm_scale @@ -367,8 +368,8 @@ def flash_bwd_kernel(Q: T.Tensor(q_shape, @tilelang.jit(out_idx=[1]) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 64 @@ -376,8 +377,8 @@ def flashattn_bwd_postprocess(batch, heads, seq_len, dim): def flash_bwd_post(dQ_in: T.Tensor(shape, accum_dtype), dQ_out: T.Tensor(shape, dtype)): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.copy( - dQ_in[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ_in[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post @@ -444,22 +445,14 @@ def benchmark_function(func, *args, warmup=10, repeat=100): return np.median(times) -def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 1): - +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int 
= 1): device = "cuda" dtype = torch.float16 torch.manual_seed(42) torch.cuda.manual_seed(42) - print( - f"Test configuration: batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, is_causal={is_causal}, groups={groups}" - ) + print(f"Test configuration: batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, is_causal={is_causal}, groups={groups}") flops_per_gemm = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 5 * flops_per_gemm @@ -515,22 +508,19 @@ def main(batch: int = 1, o_ref.backward(dO) print("Verifying backward pass correctness...") - dq_close, dq_max_diff, dq_mean_diff = debug_tensor_comparison( - dQ_tl, q_ref.grad, "dQ", rtol=0.05, atol=0.05) + dq_close, dq_max_diff, dq_mean_diff = debug_tensor_comparison(dQ_tl, q_ref.grad, "dQ", rtol=0.05, atol=0.05) if dq_close: print("dQ is correct.") else: print("dQ mismatch detected.") - dk_close, dk_max_diff, dk_mean_diff = debug_tensor_comparison( - dK_tl.to(torch.float16), k_ref.grad, "dK", rtol=0.05, atol=0.05) + dk_close, dk_max_diff, dk_mean_diff = debug_tensor_comparison(dK_tl.to(torch.float16), k_ref.grad, "dK", rtol=0.05, atol=0.05) if dk_close: print("dK is correct.") else: print("dK mismatch detected.") - dv_close, dv_max_diff, dv_mean_diff = debug_tensor_comparison( - dV_tl.to(torch.float16), v_ref.grad, "dV", rtol=0.05, atol=0.05) + dv_close, dv_max_diff, dv_mean_diff = debug_tensor_comparison(dV_tl.to(torch.float16), v_ref.grad, "dV", rtol=0.05, atol=0.05) if dv_close: print("dV is correct.") else: @@ -551,9 +541,7 @@ def run_reference_fwd_bwd(): torch.cuda.synchronize() ref_latency = benchmark_function(run_reference_fwd_bwd, warmup=10, repeat=100) - print( - f"Reference PyTorch Forward+Backward: {ref_latency:.2f} ms | {total_flops / ref_latency * 1e-9:.2f} TFlops" - ) + print(f"Reference PyTorch Forward+Backward: {ref_latency:.2f} ms | {total_flops / ref_latency * 1e-9:.2f} TFlops") def run_complete_fwd_bwd(): o_tl_bench, lse_tl_bench = fwd_kernel(q, k, v) @@ -591,12 +579,12 @@ def run_complete_fwd_bwd(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=8, help='heads') - parser.add_argument('--seq_len', type=int, default=1024, help='sequence length') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--groups', type=int, default=1, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=8, help="heads") + parser.add_argument("--seq_len", type=int, default=1024, help="sequence length") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--groups", type=int, default=1, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index 6ec5db1e5..ca9c361ff 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -2,29 +2,42 @@ import torch.nn.functional as F import tilelang import tilelang.language as T -from tilelang.primitives.gemm.base import GemmWarpPolicy +from tilelang.tileop.base import GemmWarpPolicy import itertools import argparse from functools import 
partial +# Custom supply function to ensure tensors are created on GPU +def supply_tensors_gpu(params): + """Supply function that creates tensors on GPU for ROCm/HIP.""" + tensors = [] + for param in params: + if hasattr(param, "shape") and hasattr(param, "dtype"): + # Force creation on GPU device + shape = [int(s) for s in param.shape] + tensor = torch.randn(shape, dtype=param.dtype, device="cuda") + tensors.append(tensor) + else: + tensors.append(param) + return tensors + + def ref_program(Q, K, V, is_causal, groups=1): - assert Q.size( - 2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" - assert Q.size( - 2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -43,27 +56,27 @@ def get_configs(): valid_configs = [] - for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product(block_M, block_N, num_split_q, - threads, num_stages, - enable_rasterization, k_pack, - panel_size, qk_coalesced_width, - v_coalesced_width): - valid_configs.append({ - "block_M": m, - "block_N": n, - "num_split_q": s, - "threads": t, - "num_stages": stages, - "enable_rasterization": r, - "k_pack": k, - "panel_size": p, - "qk_coalesced_width": qkw, - "v_coalesced_width": vw, - }) + for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( + block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width + ): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "num_split_q": s, + "threads": t, + "num_stages": stages, + "enable_rasterization": r, + "k_pack": k, + "panel_size": p, + "qk_coalesced_width": qkw, + "v_coalesced_width": vw, + } + ) return valid_configs -@tilelang.autotune(configs=get_configs(), cache_input_tensors=True) +@tilelang.autotune(configs=get_configs(), cache_input_tensors=True, supply_prog=supply_tensors_gpu) @tilelang.jit(out_idx=[3]) def fast_flashattn( batch, @@ -83,22 +96,22 @@ def fast_flashattn( qk_coalesced_width: int, v_coalesced_width: int, ): - scale = (1.0 / dim)**0.5 + scale = (1.0 / dim) ** 0.5 head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 vec_size = qk_coalesced_width v_vec_size = v_coalesced_width @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with 
T.Kernel(num_split_q, batch * heads, threads=threads) as (b_split, byz_combined): T.use_swizzle(panel_size, enable=enable_rasterization) @@ -108,7 +121,7 @@ def main( num_q_blocks = T.ceildiv(seq_len, block_M) - bx = T.alloc_var("int32") + bx = T.alloc_var(T.int32) bx = b_split with T.While(bx < num_q_blocks): @@ -132,32 +145,21 @@ def main( m_prev = T.alloc_fragment([block_M], accum_dtype) scale_factor = T.alloc_fragment([block_M], accum_dtype) - T.copy( - Q[bz, q_block_offset:q_block_offset + block_M, by, :], - Q_shared, - coalesced_width=vec_size) + T.copy(Q[bz, q_block_offset : q_block_offset + block_M, by, :], Q_shared, coalesced_width=vec_size) - loop_end_k = T.ceildiv(q_block_offset + block_M, - block_N) if is_causal else T.ceildiv(seq_len, block_N) + loop_end_k = T.ceildiv(q_block_offset + block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) row_sum = T.alloc_fragment([block_M], accum_dtype) for k in T.Pipelined(loop_end_k, num_stages=num_stages): kv_idx = k * block_N - T.copy( - K[bz, kv_idx:kv_idx + block_N, by // groups, :], - K_shared, - coalesced_width=vec_size) - T.copy( - V[bz, kv_idx:kv_idx + block_N, by // groups, :], - V_shared, - coalesced_width=v_vec_size) + T.copy(K[bz, kv_idx : kv_idx + block_N, by // groups, :], K_shared, coalesced_width=vec_size) + T.copy(V[bz, kv_idx : kv_idx + block_N, by // groups, :], V_shared, coalesced_width=v_vec_size) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_block_offset + i >= kv_idx + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) T.gemm( @@ -171,6 +173,8 @@ def main( T.copy(m_i, m_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for i in T.Parallel(block_M): + m_i[i] = T.max(m_i[i], m_prev[i]) for i in T.Parallel(block_M): sf = T.exp(m_prev[i] * scale - m_i[i] * scale) @@ -205,13 +209,7 @@ def main( return main -def main(batch: int = 1, - heads: int = 8, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 1): - +def main(batch: int = 1, heads: int = 8, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 1): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul if is_causal: @@ -233,18 +231,16 @@ def main(batch: int = 1, print(f"Reference (PyTorch): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") latency = profiler.do_bench(warmup=100) - print( - f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops" - ) + print(f"Fast Flash Attention V2 (Tile-lang): {latency:.2f} ms | {total_flops / latency * 1e-9:.2f} TFlops") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=8, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--groups', type=int, default=1, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=8, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", 
action="store_true", help="causal") + parser.add_argument("--groups", type=int, default=1, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups) diff --git a/examples/analyze/README.md b/examples/analyze/README.md index 8171d8826..1c2788b0b 100644 --- a/examples/analyze/README.md +++ b/examples/analyze/README.md @@ -21,9 +21,9 @@ M = N = K = 1024 def kernel(block_M=128, block_N=128, block_K=32, num_stages=3, thread_num=128): @T.prim_func - def main(A: T.Tensor((M, K), "float16"), - B: T.Tensor((N, K), "float16"), - C: T.Tensor((M, N), "float")): + def main(A: T.Tensor((M, K), T.float16), + B: T.Tensor((N, K), T.float16), + C: T.Tensor((M, N), T.float)): # ... (kernel definition) return main @@ -40,9 +40,9 @@ from tilelang.carver.arch import CUDA def kernel(N=64, C=256, H=512, W=512, F=512, K=3, block_M=64, block_N=128): @T.prim_func - def main(data: T.Tensor((N, H, W, C), "float16"), - kernel: T.Tensor((K, K, C, F), "float16"), - out: T.Tensor((N, (H-K+1), (W-K+1), F), "float")): + def main(data: T.Tensor((N, H, W, C), T.float16), + kernel: T.Tensor((K, K, C, F), T.float16), + out: T.Tensor((N, (H-K+1), (W-K+1), F), T.float)): # ... (convolution kernel definition) return main @@ -64,10 +64,10 @@ class AnalysisResult: ``` ### `Analyzer` Class Methods #### `analysis(fn, device)` -* ​Parameters: - * fn: TVM IRModule or PrimFunc - * device: Device configuration object -* Returns: AnalysisResult +- ​Parameters: + - fn: TVM IRModule or PrimFunc + - device: Device configuration object +- Returns: AnalysisResult #### Supported Architectures ```python # Extendable to custom hardware via: "compute_capability": (cores_per_SM, clock_GHz, flops_per_cycle, max_SM_count) diff --git a/examples/analyze/example_conv_analyze.py b/examples/analyze/example_conv_analyze.py index 540fcf4b7..06e5a86e9 100644 --- a/examples/analyze/example_conv_analyze.py +++ b/examples/analyze/example_conv_analyze.py @@ -2,7 +2,6 @@ from tilelang.tools import Analyzer from tilelang.carver.arch import CUDA from tilelang.carver.arch import CDNA -from tilelang.layout import make_swizzled_layout import torch N = 64 @@ -25,38 +24,21 @@ def check_hopper(): return False -def kernel(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - threads, - dtype="float16", - accum_dtype="float"): +def kernel(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, threads, dtype=T.float16, accum_dtype=T.float32): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def conv( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=threads) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -65,12 +47,6 @@ def conv( kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - 
T.annotate_layout({ - out_shared: make_swizzled_layout(out_shared), - data_shared: make_swizzled_layout(data_shared), - kernel_shared: make_swizzled_layout(kernel_shared), - }) - T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): if is_hopper: @@ -81,10 +57,8 @@ def conv( m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) diff --git a/examples/analyze/example_gemm_analyze.py b/examples/analyze/example_gemm_analyze.py index bfd934f6a..0367af126 100644 --- a/examples/analyze/example_gemm_analyze.py +++ b/examples/analyze/example_gemm_analyze.py @@ -15,14 +15,14 @@ def kernel( thread_num=None, enable_rasteration=None, ): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def matmul( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) diff --git a/examples/attention_sink/README.md b/examples/attention_sink/README.md index ed4b7004e..2cba8f0cc 100644 --- a/examples/attention_sink/README.md +++ b/examples/attention_sink/README.md @@ -2,7 +2,6 @@ We compare with an optimized version of the official Triton implementation [here](https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py). - ## Algorithm ### Forward The only change from vanilla FlashAttention is that `sinks` should be taken into consideration in the softmax, which requires an extra rescaling at the epilogue stage. @@ -43,4 +42,4 @@ where $P_{b, h, q}$ is the proportion of $sink_h$ in the softmax in the $b$-th b | 16384 | 64 | 309.46 | **400.62** | 1.29x | | 16384 | 128 | 418.99 | **549.11** | 1.31x | -> The backward performance will be further optimized in the future. \ No newline at end of file +> The backward performance will be further optimized in the future. 
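To make the sink softmax described in the README above concrete, here is a minimal forward reference in PyTorch. This is a sketch only, not the repository's API: it assumes `q`, `k`, `v` of shape `[batch, heads, seq, dim]`, a per-head `sinks` vector, causal attention with equal query/key lengths, and no GQA grouping; the `ref_program` functions in the example files remain the authoritative reference.

```python
import torch

def sink_attention_ref(q, k, v, sinks, sm_scale=None):
    """Causal attention where a per-head sink logit joins the softmax denominator only."""
    if sm_scale is None:
        sm_scale = q.shape[-1] ** -0.5
    scores = torch.einsum("bhqd,bhkd->bhqk", q.float(), k.float()) * sm_scale
    causal = torch.ones(scores.shape[-2:], dtype=torch.bool, device=q.device).triu(1)
    scores = scores.masked_fill(causal, float("-inf"))
    m = scores.amax(dim=-1, keepdim=True)          # row max, for numerical stability
    p = torch.exp(scores - m)                      # unnormalized probabilities
    # The sink enlarges the normalizer but carries no value vector, which is why the
    # kernels below only touch `logsum` in the epilogue.
    denom = p.sum(dim=-1, keepdim=True) + torch.exp(sinks.float().view(1, -1, 1, 1) - m)
    return ((p / denom) @ v.float()).to(q.dtype)
```

Dividing by `denom` instead of `p.sum(...)` is exactly the extra epilogue rescaling the README refers to.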
diff --git a/examples/attention_sink/benchmark_gqa_sink_fwd.py b/examples/attention_sink/benchmark_gqa_sink_fwd.py index 00256286b..211ef1d18 100644 --- a/examples/attention_sink/benchmark_gqa_sink_fwd.py +++ b/examples/attention_sink/benchmark_gqa_sink_fwd.py @@ -1,10 +1,12 @@ import torch import argparse from tilelang.profiler import do_bench +from tilelang import language as T import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor from example_gqa_sink_fwd_bhsd_wgmma_pipelined import flashattn, ref_program, gen_inputs +from typing import Optional @triton.jit @@ -50,8 +52,7 @@ def triton_kernel( q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM]) if BANDWIDTH: - lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M + lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M else: lo, hi = 0, start_q + (start_m + 1) * BLOCK_M @@ -94,7 +95,7 @@ def triton_kernel( Out.store([off_z, off_h, start_m * BLOCK_M, 0], acc) -def triton_program(Q, K, V, Sinks, window_size: int | None = None) -> torch.Tensor: +def triton_program(Q, K, V, Sinks, window_size: Optional[int] = None) -> torch.Tensor: bs, n_heads, seq_q, head_dim = Q.shape _, n_heads_kv, seq_kv, _ = K.shape BLOCK_M = 64 @@ -119,7 +120,8 @@ def triton_program(Q, K, V, Sinks, window_size: int | None = None) -> torch.Tens BANDWIDTH=window_size, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - start_q=seq_kv - seq_q) + start_q=seq_kv - seq_q, + ) return o @@ -130,18 +132,18 @@ def main( seq_kv: int = 256, dim: int = 128, groups: int = 8, - window_size: int | None = None, + window_size: Optional[int] = None, dtype: str = "float16", tune: bool = False, ): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -169,15 +171,14 @@ def main( block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) if torch.allclose( - triton_program(Q, K, V, sinks, window_size), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2): + triton_program(Q, K, V, sinks, window_size), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ): print("Checks for triton passed.✅") else: print("Checks for triton failed.❌") @@ -197,20 +198,14 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_q', type=int, default=2048, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=2048, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - 
parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_q", type=int, default=2048, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=2048, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, - args.dtype, args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/benchmark_mha_sink_fwd.py b/examples/attention_sink/benchmark_mha_sink_fwd.py index 734870fe4..50747e6b0 100644 --- a/examples/attention_sink/benchmark_mha_sink_fwd.py +++ b/examples/attention_sink/benchmark_mha_sink_fwd.py @@ -1,10 +1,12 @@ import torch import argparse from tilelang.profiler import do_bench +from tilelang import language as T import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor from example_mha_sink_fwd_bhsd_wgmma_pipelined import flashattn, ref_program, gen_inputs +from typing import Optional @triton.jit @@ -49,8 +51,7 @@ def triton_kernel( q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM]) if BANDWIDTH: - lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M + lo, hi = tl.maximum(0, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M else: lo, hi = 0, start_q + (start_m + 1) * BLOCK_M @@ -93,7 +94,7 @@ def triton_kernel( Out.store([off_z, off_h, start_m * BLOCK_M, 0], acc) -def triton_program(Q, K, V, Sinks, window_size: int | None = None) -> torch.Tensor: +def triton_program(Q, K, V, Sinks, window_size: Optional[int] = None) -> torch.Tensor: bs, n_heads, seq_q, head_dim = Q.shape seq_kv = K.shape[2] BLOCK_M = 64 @@ -116,26 +117,29 @@ def triton_program(Q, K, V, Sinks, window_size: int | None = None) -> torch.Tens BANDWIDTH=window_size, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - start_q=seq_kv - seq_q) + start_q=seq_kv - seq_q, + ) return o -def main(batch: int = 1, - heads: int = 32, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: int | None = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding 
window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -162,15 +166,14 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") latency = do_bench(lambda: triton_program(Q, K, V, sinks, window_size), warmup=500) @@ -183,19 +186,13 @@ def main(batch: int = 1, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py index f8f970ea4..cfdcd21b5 100644 --- a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py @@ -13,50 +13,50 @@ def get_bwd_configs(): sm_version = sm_major * 10 + sm_minor if sm_version == 80: return 64, 32, 1, 128 - elif sm_version == 90: - return 128, 32, 2, 256 else: - raise ValueError(f"Unsupported SM version: {sm_version}") + return 128, 32, 2, 256 @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd( - batch, - heads, - 
seq_len, - dim, - groups=1, - window_size=None, # None for full attention - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): - + batch, + heads, + seq_len, + dim, + groups=1, + window_size=None, # None for full attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_len, dim] kv_shape = [batch, head_kv, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(kv_shape, dtype), # type: ignore - V: T.Tensor(kv_shape, dtype), # type: ignore - Output: T.Tensor(q_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Sinks: T.Tensor([heads], dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(kv_shape, dtype), # type: ignore + V: T.Tensor(kv_shape, dtype), # type: ignore + Output: T.Tensor(q_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -72,8 +72,7 @@ def flash_fwd( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([heads], dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -81,34 +80,30 @@ def flash_fwd( sinks[i] = Sinks[by] end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) - start = T.alloc_local([1], 'int32') - if window_size is not None: - start[0] = T.max(0, (bx * block_M - window_size) // block_N) - else: - start[0] = 0 - - for k in T.Pipelined(start[0], end, num_stages=num_stages): - T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared) + start = T.max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0 + + for k in T.Pipelined(start, end, num_stages=num_stages): + T.copy(K[bz, by // groups, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, - 0, -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by // groups, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This 
process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -125,32 +120,33 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -159,65 +155,61 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 
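For orientation, the `flash_bwd_prep` kernel above accumulates the per-row correction term $\Delta_{b,h,q} = \sum_{d} O_{b,h,q,d}\, dO_{b,h,q,d}$ that the backward pass consumes. An equivalent PyTorch sketch (illustrative only; same `[batch, heads, seq_len, dim]` layout and fp32 accumulation as in the kernel):

```python
import torch

def delta_ref(O: torch.Tensor, dO: torch.Tensor) -> torch.Tensor:
    # Row-wise sum over the head dimension of the elementwise product, accumulated in fp32.
    return (O.float() * dO.float()).sum(dim=-1)   # -> [batch, heads, seq_len]
```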
@T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd(batch, - heads, - seq_len, - dim, - groups, - window_size=None, - sm_scale=None, - dtype="float16"): # None for full attention +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd(batch, heads, seq_len, dim, groups, window_size=None, sm_scale=None, dtype=T.float16): # None for full attention if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_len, dim] kv_shape = [batch, head_kv, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 block_M, block_N, num_stages, threads = get_bwd_configs() @@ -226,15 +218,15 @@ def flashattn_bwd(batch, @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(kv_shape, dtype), # type: ignore - V: T.Tensor(kv_shape, dtype), # type: ignore - dO: T.Tensor(q_shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(kv_shape, accum_dtype), # type: ignore - dV: T.Tensor(kv_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(kv_shape, dtype), # type: ignore + V: T.Tensor(kv_shape, dtype), # type: ignore + dO: T.Tensor(q_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(kv_shape, accum_dtype), # type: ignore + dV: T.Tensor(kv_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -254,47 +246,44 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], accum_dtype) dk_shared = T.alloc_shared([block_M, dim], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx // groups, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx // groups, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, bx // groups, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx // groups, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) - loop_ed = T.alloc_local([1], 'int32') - if window_size is not None: - loop_ed[0] = T.min( - T.ceildiv((by + 1) * block_M + window_size, block_N), - 
T.ceildiv(seq_len, block_N)) - else: - loop_ed[0] = T.ceildiv(seq_len, block_N) - for k in T.Pipelined(loop_st, loop_ed[0], num_stages=num_stages): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(seq_len, block_N)) + if window_size is not None + else T.ceildiv(seq_len, block_N) + ) + + for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) for i, j in T.Parallel(block_M, block_N): if window_size is not None: qkT[i, j] = T.if_then_else( - by * block_M + i <= k * block_N + j and - by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0) + by * block_M + i <= k * block_N + j and by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0 + ) else: - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], dst=do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], dst=do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -303,50 +292,46 @@ def flash_bwd( T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) - T.atomic_add(dQ[bz, bx, k * block_N:(k + 1) * block_N, :], dq) + T.atomic_add(dQ[bz, bx, k * block_N : (k + 1) * block_N, :], dq) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, bx // groups, by * block_M:(by + 1) * block_M, :], dv_shared) + T.atomic_add(dV[bz, bx // groups, by * block_M : (by + 1) * block_M, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, bx // groups, by * block_M:(by + 1) * block_M, :], dk_shared) + T.atomic_add(dK[bz, bx // groups, by * block_M : (by + 1) * block_M, :], dk_shared) return flash_bwd @tilelang.jit(out_idx=-1) -def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: str = "float16"): - accum_dtype = "float" +def flashattn_bwd_dsink(batch, heads, seq_len, block=256, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len] @T.prim_func def flash_bwd_dsink( - Sinks: T.Tensor([heads], dtype), # type: ignore - Delta: T.Tensor(shape, accum_dtype), # type: ignore - lse: T.Tensor(shape, accum_dtype), # type: ignore - dsinks: T.Tensor(shape, dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore + Delta: T.Tensor(shape, accum_dtype), # type: ignore + lse: T.Tensor(shape, accum_dtype), # type: ignore + dsinks: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block), batch, threads=256) as (bx, by, bz): - sink = T.alloc_local([1], dtype) lse_fragment = T.alloc_fragment([block], accum_dtype) delta_fragment = T.alloc_fragment([block], accum_dtype) dsink_fragment = T.alloc_fragment([block], dtype) - sink[0] = Sinks[bx] - T.copy(lse[bz, bx, by * 
block:(by + 1) * block], lse_fragment) - T.copy(Delta[bz, bx, by * block:(by + 1) * block], delta_fragment) + sink = Sinks[bx] + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) for i in T.Parallel(block): - dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - - lse_fragment[i]) * delta_fragment[i] - T.copy(dsink_fragment, dsinks[bz, bx, by * block:(by + 1) * block]) + dsink_fragment[i] = -T.exp2(sink * 1.44269504 - lse_fragment[i]) * delta_fragment[i] + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) return flash_bwd_dsink class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, sinks, window_size, groups): - def maybe_contiguous(x): if x.stride(-1) != 1: return x.contiguous() @@ -354,7 +339,7 @@ def maybe_contiguous(x): q, k, v, sinks = [maybe_contiguous(x) for x in (q, k, v, sinks)] BATCH, H, N_CTX, D_HEAD = q.shape - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) o, lse = kernel(q, k, v, sinks) ctx.save_for_backward(q, k, v, sinks, o, lse) @@ -367,7 +352,7 @@ def backward(ctx, do): q, k, v, sinks, o, lse = ctx.saved_tensors BATCH, H, N_CTX, D_HEAD = q.shape groups = ctx.groups - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) kernel_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) @@ -392,13 +377,14 @@ def backward(ctx, do): # Adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() batch_size, num_keys, num_key_value_heads, head_dim = key.shape @@ -434,32 +420,32 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def main(BATCH: int = 1, - H: int = 8, - N_CTX: int = 512, - D_HEAD: int = 64, - groups: int = 2, - window_size: int | None = None, - dtype: str = "float16"): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + BATCH: int = 1, + H: int = 8, + N_CTX: int = 512, + D_HEAD: int = 64, + groups: int = 2, + window_size: Optional[int] = None, + dtype: str = "float16", +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= N_CTX - flops_per_matmul = 2.0 * BATCH * H * min( - window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation + flops_per_matmul = 2.0 * BATCH * H * 
min(window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD * 0.5 total_flops = 5 * flops_per_matmul - Q = (torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_()) - K = torch.randn( - BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() + K = torch.randn(BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() V = torch.randn_like(K).requires_grad_() sinks = torch.randn(H, dtype=torch_dtype, device="cuda").requires_grad_() dO = torch.randn_like(Q) @@ -480,19 +466,14 @@ def main(BATCH: int = 1, # Checks rtol, atol = { - "float16": (1e-2, 1e-2), - "bfloat16": (2e-2, 2e-2), + T.float16: (1e-2, 1e-2), + T.bfloat16: (2e-2, 2e-2), }[dtype] - assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f'O max err: {(O-O_ref).abs().max()}' - assert torch.allclose( - dV, dV_ref, rtol=rtol, atol=atol), f'dV max err: {(dV-dV_ref).abs().max()}' - assert torch.allclose( - dK, dK_ref, rtol=rtol, atol=atol), f'dK max err: {(dK-dK_ref).abs().max()}' - assert torch.allclose( - dQ, dQ_ref, rtol=rtol, atol=atol), f'dq max err: {(dQ-dQ_ref).abs().max()}' - assert torch.allclose( - dsinks, dsinks_ref, rtol=rtol, - atol=atol), f'dsinks max err: {(dsinks-dsinks_ref).abs().max()}' + assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f"O max err: {(O - O_ref).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dq max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" print("All checks passed for tilelang kernels.✅") @@ -511,19 +492,57 @@ def tl_bwd(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + BATCH: int = 1, + H: int = 8, + N_CTX: int = 512, + D_HEAD: int = 64, + groups: int = 2, + window_size: Optional[int] = None, + dtype: str = "float16", +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + with torch.no_grad(): + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda") + K = torch.randn(BATCH, H // groups, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda") + V = torch.randn_like(K) + sinks = torch.randn(H, dtype=torch_dtype, device="cuda") + dO = torch.randn_like(Q) + fwd = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) + O, lse = fwd(Q, K, V, sinks) + + def maybe_contiguous(x): + return x if x.stride(-1) == 1 else x.contiguous() + + do, q, k, v, sinks_c, o = [maybe_contiguous(x) for x in (dO, Q, K, V, sinks, O)] + k_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) + Delta = k_prep(o, do) + k_bwd = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, groups, window_size, dtype=dtype) + k_dsink = flashattn_bwd_dsink(BATCH, H, N_CTX, dtype=dtype) + q_shape = (BATCH, H, N_CTX, D_HEAD) + head_kv = H // groups + kv_shape = (BATCH, head_kv, N_CTX, D_HEAD) + dq = torch.zeros(q_shape, dtype=torch.float32, device="cuda") + dk = torch.zeros(kv_shape, dtype=torch.float32, device="cuda") + dv = torch.zeros(kv_shape, dtype=torch.float32, 
device="cuda") + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + _ = k_dsink(sinks_c, Delta, lse).sum(0).sum(1) + + def run_kernel_only(): + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + + latency_ms = do_bench(run_kernel_only, backend="cupti") + return latency_ms + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='Batch size') - parser.add_argument('--h', type=int, default=64, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=4096, help='Context size') - parser.add_argument('--d_head', type=int, default=128, help='Head dimension') - parser.add_argument('--groups', type=int, default=8, help='Groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--h", type=int, default=64, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") + parser.add_argument("--d_head", type=int, default=128, help="Head dimension") + parser.add_argument("--groups", type=int, default=8, help="Groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.groups, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py index 49a3ecbd8..fa73df0af 100644 --- a/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_gqa_sink_fwd_bhsd_wgmma_pipelined.py @@ -6,7 +6,6 @@ from tilelang.autotuner import autotune from tilelang.profiler import do_bench import tilelang.language as T -from tilelang.layout import make_swizzled_layout import itertools import argparse from typing import Optional @@ -23,9 +22,11 @@ def get_configs(): rep=100, ) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( batch, heads, @@ -39,106 +40,30 @@ def flashattn( block_N=128, num_stages=2, threads=256, - dtype: str = "float16", + dtype: T.dtype = T.float16, ): - if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, head_kv, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by // groups, k * block_N:(k + 1) * block_N, :], K_shared) - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - if 
window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) - else: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by // groups, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # NOTE(wt): check_inf is necessary for sliding window attention. - for i in T.Parallel(block_M): - if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
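The comment above is simply the base-2 rewrite of the stabilized softmax: $e^{x-m} = 2^{(x-m)\log_2 e} = 2^{x\log_2 e - m\log_2 e}$, which is why `scale` is pre-multiplied by $\log_2 e \approx 1.44269504$ and `T.exp2` is used throughout; the multiply and subtract in the exponent can then be emitted as a single FFMA rather than a separate fmul and fadd.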
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -155,61 +80,83 @@ def main( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) - - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.alloc_local([1], 'int32') - if window_size is not None: - start[0] = T.max(0, (bx * block_M + past_len - window_size) // block_N) - else: - start[0] = 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined( - start[0], - end, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + start, + end, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, by // groups, k * block_N : (k + 1) * block_N, :], K_shared) + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + if window_size is not None: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) + else: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # NOTE(wt): check_inf is necessary for sliding window attention. 
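A brief note on why the clamp below exists: with a sliding window an entire key block can be masked out, so the block row-max stays at $-\infty$; clamping it to 0 keeps the exponents well defined (otherwise $-\infty - (-\infty)$ yields NaN in both the rescale factor and the probabilities). A scalar sketch of the running-max update (softmax scaling omitted; purely illustrative, not the kernel's API):

```python
import math

def rescale_factor(m_prev, row_max_block):
    m_new = max(m_prev, row_max_block)
    if m_new == float("-inf"):       # fully masked block: clamp so exp() below stays finite
        m_new = 0.0
    # Factor applied to the already-accumulated acc_o and logsum.
    return m_new, math.exp(m_prev - m_new)
```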
+ for i in T.Parallel(block_M): + if window_size is not None: + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by // groups, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Following functions are adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() batch_size, num_keys, num_key_value_heads, head_dim = key.shape @@ -245,23 +192,15 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - groups, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H // groups, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H // groups, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, groups, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H // groups, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H // groups, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks @@ -272,18 +211,18 @@ def main( seq_kv: int = 256, dim: int = 128, groups: int = 8, - window_size: int | None = None, - dtype: str = "float16", + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, tune: bool = 
False, ): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -311,15 +250,14 @@ def main( block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") # Benchmark tilelang @@ -328,22 +266,51 @@ def main( print("Tilelang: {:.2f} TFlops".format(total_flops / latency_tilelang * 1e-9)) +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + groups: int = 8, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + block_M = 128 + block_N = 128 + num_stages = 2 + threads = 256 + kernel = flashattn( + batch, + heads, + seq_q, + seq_kv, + dim, + groups, + window_size, + block_M=block_M, + block_N=block_N, + num_stages=num_stages, + threads=threads, + dtype=dtype, + ) + Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, groups, dtype=torch_dtype) + latency = do_bench(lambda: kernel(Q, K, V, sinks), backend="cupti") + return latency + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_q', type=int, default=2048, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=2048, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_q", type=int, default=2048, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=2048, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, 
default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, - args.dtype, args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.groups, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_mha_sink_bwd_bhsd.py b/examples/attention_sink/example_mha_sink_bwd_bhsd.py index ee1c35ece..66905f55d 100644 --- a/examples/attention_sink/example_mha_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_bwd_bhsd.py @@ -20,40 +20,42 @@ def get_bwd_configs(): @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd( - batch, - heads, - seq_len, - dim, - window_size=None, # None for full attention, - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): - + batch, + heads, + seq_len, + dim, + window_size=None, # None for full attention, + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Sinks: T.Tensor([heads], dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -69,8 +71,7 @@ def flash_fwd( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([heads], dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -78,34 +79,30 @@ def flash_fwd( sinks[i] = Sinks[by] end = T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) - start = T.alloc_local([1], 'int32') - if window_size is not None: - start[0] = T.max(0, (bx * block_M - window_size) // block_N) - else: - start[0] = 0 - - for k in T.Pipelined(start[0], end, num_stages=num_stages): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + start = T.max(0, (bx * block_M - window_size) // block_N) if window_size is not None else 0 + + for k in T.Pipelined(start, end, num_stages=num_stages): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) for i, j in T.Parallel(block_M, block_N): q_idx = bx * block_M + i k_idx = k * block_N + j if window_size is not None: - acc_s[i, j] = 
T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, - 0, -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) else: acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # NOTE(wt): check_inf is necessary for sliding window attention. for i in T.Parallel(block_M): if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -122,32 +119,33 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -156,49 +154,52 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return 
flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: str = "float16"): - accum_dtype = "float" + }, +) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd( batch, heads, @@ -206,32 +207,31 @@ def flashattn_bwd( dim, window_size=None, # None for full attention sm_scale=None, - dtype: str = "float16", + dtype: T.dtype = T.float16, ): - block_M, block_N, num_stages, threads = get_bwd_configs() if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - accum_dtype = "float" + accum_dtype = T.float32 if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -255,47 +255,43 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - 
T.copy(K[bz, bx, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, bx, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) - loop_ed = T.alloc_local([1], 'int32') - if window_size is not None: - loop_ed[0] = T.min( - T.ceildiv((by + 1) * block_M + window_size, block_N), - T.ceildiv(seq_len, block_N)) - else: - loop_ed[0] = T.ceildiv(seq_len, block_N) - for k in T.Pipelined(loop_st, loop_ed[0], num_stages=num_stages): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M + window_size, block_N), T.ceildiv(seq_len, block_N)) + if window_size is not None + else T.ceildiv(seq_len, block_N) + ) + for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) for i, j in T.Parallel(block_M, block_N): if window_size is not None: qkT[i, j] = T.if_then_else( - by * block_M + i <= k * block_N + j and - by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0) + by * block_M + i <= k * block_N + j and by * block_M + i > k * block_N + j - window_size, qkT[i, j], 0 + ) else: - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], dst=do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], dst=do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, B=do, C=dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -304,51 +300,48 @@ def flash_bwd( T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) - T.atomic_add(dQ[bz, bx, k * block_N:(k + 1) * block_N, :], dq) + T.atomic_add(dQ[bz, bx, k * block_N : (k + 1) * block_N, :], dq) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, bx, by * block_M:(by + 1) * block_M, :]) - T.copy(dk_shared, dK[bz, bx, by * block_M:(by + 1) * block_M, :]) + T.copy(dv_shared, dV[bz, bx, by * block_M : (by + 1) * block_M, :]) + T.copy(dk_shared, dK[bz, bx, by * block_M : (by + 1) * block_M, :]) return flash_bwd @tilelang.jit(out_idx=-1) -def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: str = "float16"): - accum_dtype = "float" +def flashattn_bwd_dsink(batch, heads, seq_len, block=128, dtype: T.dtype = T.float16): + accum_dtype = T.float32 shape = [batch, heads, seq_len] @T.prim_func def flash_bwd_dsink( - Sinks: T.Tensor([heads], dtype), # type: ignore - Delta: T.Tensor(shape, accum_dtype), # type: ignore - lse: T.Tensor(shape, accum_dtype), # type: ignore - dsinks: T.Tensor(shape, accum_dtype), # type: ignore + Sinks: T.Tensor([heads], dtype), # type: ignore + 
Delta: T.Tensor(shape, accum_dtype), # type: ignore + lse: T.Tensor(shape, accum_dtype), # type: ignore + dsinks: T.Tensor(shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block), batch, threads=128) as (bx, by, bz): - sink = T.alloc_local([1], dtype) lse_fragment = T.alloc_fragment([block], accum_dtype) delta_fragment = T.alloc_fragment([block], accum_dtype) dsink_fragment = T.alloc_fragment([block], accum_dtype) - sink[0] = Sinks[bx] - T.copy(lse[bz, bx, by * block:(by + 1) * block], lse_fragment) - T.copy(Delta[bz, bx, by * block:(by + 1) * block], delta_fragment) + sink = Sinks[bx] + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) for i in T.Parallel(block): - dsink_fragment[i] = -T.exp2(Sinks[bx] * 1.44269504 - - lse_fragment[i]) * delta_fragment[i] - T.copy(dsink_fragment, dsinks[bz, bx, by * block:(by + 1) * block]) + dsink_fragment[i] = -T.exp2(sink * 1.44269504 - lse_fragment[i]) * delta_fragment[i] + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) return flash_bwd_dsink class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, sinks, window_size): BATCH, H, N_CTX, D_HEAD = q.shape - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, window_size, dtype=dtype) o, lse = kernel(q, k, v, sinks) ctx.save_for_backward(q, k, v, sinks, o, lse) @@ -366,7 +359,7 @@ def maybe_contiguous(x): return x do, q, k, v, sinks, o = [maybe_contiguous(x) for x in (do, q, k, v, sinks, o)] - dtype = "float16" if q.dtype == torch.float16 else "bfloat16" + dtype = T.float16 if q.dtype == torch.float16 else T.bfloat16 kernel_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) kernel_post = flashattn_bwd_postprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) delta = kernel_prep(o, do) @@ -388,15 +381,15 @@ def maybe_contiguous(x): # Adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function's interface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -431,29 +424,23 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def main(BATCH: int = 1, - H: int = 1, - N_CTX: int = 512, - D_HEAD: int = 128, - window_size: int | None = None, - dtype: str = "float16"): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main(BATCH: int = 1, H: int = 1, N_CTX: int = 512, D_HEAD: int = 128, 
window_size: Optional[int] = None, dtype: T.dtype = T.float16): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= N_CTX - flops_per_matmul = 2.0 * BATCH * H * min( - window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation + flops_per_matmul = 2.0 * BATCH * H * min(window_size, N_CTX // 2) * N_CTX * D_HEAD # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD * 0.5 total_flops = 5 * flops_per_matmul - Q = (torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_()) + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda").requires_grad_() K = torch.randn_like(Q).requires_grad_() V = torch.randn_like(Q).requires_grad_() sinks = torch.randn(H, dtype=torch_dtype, device=Q.device).requires_grad_() @@ -475,19 +462,14 @@ def main(BATCH: int = 1, # Checks rtol, atol = { - "float16": (1e-2, 1e-2), - "bfloat16": (2e-2, 2e-2), + T.float16: (1e-2, 1e-2), + T.bfloat16: (2e-2, 2e-2), }[dtype] - assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f'O max err: {(O-O_ref).abs().max()}' - assert torch.allclose( - dV, dV_ref, rtol=rtol, atol=atol), f'dV max err: {(dV-dV_ref).abs().max()}' - assert torch.allclose( - dK, dK_ref, rtol=rtol, atol=atol), f'dK max err: {(dK-dK_ref).abs().max()}' - assert torch.allclose( - dQ, dQ_ref, rtol=rtol, atol=atol), f'dq max err: {(dQ-dQ_ref).abs().max()}' - assert torch.allclose( - dsinks, dsinks_ref, rtol=rtol, - atol=atol), f'dsinks max err: {(dsinks-dsinks_ref).abs().max()}' + assert torch.allclose(O, O_ref, rtol=rtol, atol=atol), f"O max err: {(O - O_ref).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dq max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" print("All checks passed for tilelang kernels.✅") @@ -506,18 +488,53 @@ def tl_bwd(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 512, + D_HEAD: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + with torch.no_grad(): + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, dtype=torch_dtype, device="cuda") + K = torch.randn_like(Q) + V = torch.randn_like(Q) + sinks = torch.randn(H, dtype=torch_dtype, device=Q.device) + dO = torch.randn_like(Q) + fwd = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, window_size=window_size, dtype=dtype) + O, lse = fwd(Q, K, V, sinks) + + def maybe_contiguous(x): + return x if x.stride(-1) == 1 else x.contiguous() + + do, q, k, v, sinks_c, o = [maybe_contiguous(x) for x in (dO, Q, K, V, sinks, O)] + k_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD, dtype=dtype) + Delta = k_prep(o, do) + k_bwd = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, window_size, dtype=dtype) + k_dsink = flashattn_bwd_dsink(BATCH, H, N_CTX, dtype=dtype) + shape = (BATCH, H, N_CTX, D_HEAD) + dq = torch.zeros(shape, dtype=torch.float32, device=Q.device) + dk = torch.empty(shape, 
dtype=torch_dtype, device=Q.device) + dv = torch.empty(shape, dtype=torch_dtype, device=Q.device) + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + _ = k_dsink(sinks_c, Delta, lse).sum(0).sum(1) + + def run_kernel_only(): + k_bwd(q, k, v, do, lse, Delta, dq, dk, dv) + + latency_ms = do_bench(run_kernel_only, backend="cupti") + return latency_ms + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='Batch size') - parser.add_argument('--h', type=int, default=64, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=4096, help='Context size') - parser.add_argument('--d_head', type=int, default=128, help='Head dimension') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") + parser.add_argument("--batch", type=int, default=1, help="Batch size") + parser.add_argument("--h", type=int, default=64, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") + parser.add_argument("--d_head", type=int, default=128, help="Head dimension") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd.py b/examples/attention_sink/example_mha_sink_fwd_bhsd.py index 7e59e277e..f24aa38b7 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd.py @@ -5,7 +5,6 @@ from tilelang.autotuner import autotune from tilelang.profiler import do_bench import tilelang.language as T -from tilelang.layout import make_swizzled_layout import itertools import argparse from typing import Optional @@ -18,117 +17,45 @@ def get_configs(): @autotune(configs=get_configs(), warmup=500, rep=100) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - window_size=None, # None for full attention - sm_scale=None, - block_M=64, - block_N=64, - num_stages=1, - threads=128, - dtype: str = "float16"): + batch, + heads, + seq_q, + seq_kv, + dim, + window_size=None, # None for full attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - 
for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) - else: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # NOTE(wt): check_inf is necessary for sliding window attention. - for i in T.Parallel(block_M): - if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
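# Illustration (standalone sketch, not part of this patch): the comment above relies on
# the identity exp(x - m) == exp2(x*log2(e) - m*log2(e)); folding log2(e) into `scale` up
# front lets the backend feed exp2 with a single fused multiply-add. Assumes only NumPy.
import numpy as np

LOG2E = 1.44269504  # the same constant multiplied into `scale` in the kernels
x = np.random.randn(8).astype(np.float32)
m = x.max()
ref = np.exp(x - m)                    # textbook softmax numerator
fast = np.exp2(x * LOG2E - m * LOG2E)  # exp2 form used by the kernel
assert np.allclose(ref, fast, rtol=1e-6, atol=1e-6)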
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -145,56 +72,76 @@ def main( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) - - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - - start = T.alloc_local([1], 'int32') - if window_size is not None: - start[0] = T.max(0, (bx * block_M + past_len - window_size) // block_N) - else: - start[0] = 0 - - for k in T.Pipelined(start[0], end, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 + + for k in T.Pipelined(start, end, num_stages=num_stages): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + if window_size is not None: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) + else: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # NOTE(wt): check_inf is necessary for sliding window attention. 
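# Illustration (standalone sketch, not part of this patch): why the check_inf clamp in the
# next few lines is needed for sliding-window attention. If a row has seen only masked
# blocks, its running max stays -inf and the rescale factor exp2(m_prev*scale - m_new*scale)
# evaluates to nan; clamping the max to 0 turns it into exp2(-inf) = 0, which cleanly zeroes
# the (empty) accumulator instead of poisoning it. Plain-Python version of the arithmetic:
import math

scale = 1.0
m_prev = float("-inf")                                     # no valid score seen yet in this row
m_new = float("-inf")                                      # current block fully masked out
without_clamp = 2.0 ** (m_prev * scale - m_new * scale)    # -inf - (-inf) -> nan
m_new = 0.0 if m_new == float("-inf") else m_new           # the check_inf clamp
with_clamp = 2.0 ** (m_prev * scale - m_new * scale)       # 2**(-inf) -> 0.0
assert math.isnan(without_clamp) and with_clamp == 0.0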
+ for i in T.Parallel(block_M): + if window_size is not None: + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Modified from https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function's interface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function's interface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -229,41 +176,36 @@ def ref_program(query: torch.Tensor, output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks -def main(batch: int = 1, - heads: int = 1, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: int | None = None, - dtype: str = "float16", - tune: bool = 
False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 1, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -290,19 +232,17 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") - latency = do_bench( - lambda: ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), warmup=500) + latency = do_bench(lambda: ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), warmup=500) print("Ref: {:.2f} ms".format(latency)) print("Ref: {:.2f} TFlops".format(total_flops / latency * 1e-9)) latency = do_bench(lambda: kernel(Q, K, V, sinks), warmup=500) @@ -310,21 +250,37 @@ def main(batch: int = 1, print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + block_M = 128 + block_N = 128 + num_stages = 2 + threads = 256 + kernel = flashattn( + batch, heads, seq_q, seq_kv, dim, window_size, block_M=block_M, block_N=block_N, num_stages=num_stages, threads=threads, dtype=dtype + ) + Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) + latency = do_bench(lambda: kernel(Q, K, V, sinks), backend="cupti") + return latency + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + 
parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default=T.float16, help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py index eee2f3ac5..b47c8175f 100644 --- a/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py +++ b/examples/attention_sink/example_mha_sink_fwd_bhsd_wgmma_pipelined.py @@ -6,7 +6,6 @@ from tilelang.autotuner import autotune from tilelang.profiler import do_bench import tilelang.language as T -from tilelang.layout import make_swizzled_layout import itertools import argparse from typing import Optional @@ -19,119 +18,46 @@ def get_configs(): @autotune(configs=get_configs(), warmup=500, rep=100) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - window_size=None, # None for full attention - sm_scale=None, - block_M=128, - block_N=128, - num_stages=2, - threads=256, - dtype: str = "float16"): - + batch, + heads, + seq_q, + seq_kv, + dim, + window_size=None, # None for full attention + sm_scale=None, + block_M=128, + block_N=128, + num_stages=2, + threads=256, + dtype: T.dtype = T.float16, +): if window_size is not None: assert window_size % block_N == 0, "window_size must be divisible by block_N" if sm_scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 scale = sm_scale * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - accum_dtype = "float" + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - if window_size is not None: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, - -T.infinity(acc_s.dtype)) - else: - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro 
- def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # NOTE(wt): check_inf is necessary for sliding window attention. - for i in T.Parallel(block_M): - if window_size is not None: - scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, - scores_max[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), - Sinks: T.Tensor([heads], dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + Sinks: T.Tensor([heads], dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -148,63 +74,84 @@ def main( logsum = T.alloc_fragment([block_M], accum_dtype) sinks = T.alloc_fragment([block_M], dtype) - T.annotate_layout({ - Q_shared: make_swizzled_layout(Q_shared), - K_shared: make_swizzled_layout(K_shared), - V_shared: make_swizzled_layout(V_shared), - O_shared: make_swizzled_layout(O_shared), - }) - - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) for i in T.Parallel(block_M): sinks[i] = Sinks[by] - end = T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + end = T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) - start = T.alloc_local([1], 'int32') - if window_size is not None: - start[0] = T.max(0, (bx * block_M + past_len - window_size) // block_N) - else: - start[0] = 0 + start = T.max(0, (bx * block_M + past_len - window_size) // block_N) if window_size is not None else 0 for k in T.Pipelined( - start[0], - end, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, 
scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + start, + end, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + if window_size is not None: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx and q_idx < k_idx + window_size, 0, -T.infinity(acc_s.dtype)) + else: + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # NOTE(wt): check_inf is necessary for sliding window attention. + for i in T.Parallel(block_M): + if window_size is not None: + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i in T.Parallel(block_M): - logsum[i] += T.exp2(sinks[i] * 1.44269504 - - scores_max[i] * scale) # The only change for attention sink + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) # The only change for attention sink for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main # Following functions are adapted and optimized from # https://github.com/openai/gpt-oss/blob/main/gpt_oss/triton/attention.py -def ref_program(query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - sinks: torch.Tensor, - sliding_window: Optional[int] = None, - dtype: torch.dtype = torch.float16) -> torch.Tensor: - - query = query.transpose(1, 2).contiguous().unsqueeze( - 3) # align with the original function'sinterface +def ref_program( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + sinks: torch.Tensor, + sliding_window: Optional[int] = None, + dtype: torch.dtype = torch.float16, +) -> torch.Tensor: + query = query.transpose(1, 2).contiguous().unsqueeze(3) # align with the original function'sinterface key = key.transpose(1, 2).contiguous() value = value.transpose(1, 2).contiguous() @@ -239,41 +186,36 @@ def ref_program(query: torch.Tensor, output = 
torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float()) - output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, - head_dim).to(dtype) + output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups, head_dim).to(dtype) return output.transpose(1, 2).contiguous() -def gen_inputs( - B, - H, - Sq, - Skv, - D, - dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - query = torch.randn([B, H, Sq, D], dtype=dtype, device='cuda') - key = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - value = torch.randn([B, H, Skv, D], dtype=dtype, device='cuda') - sinks = torch.randn([H], dtype=dtype, device='cuda') +def gen_inputs(B, H, Sq, Skv, D, dtype=torch.float16) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + query = torch.randn([B, H, Sq, D], dtype=dtype, device="cuda") + key = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + value = torch.randn([B, H, Skv, D], dtype=dtype, device="cuda") + sinks = torch.randn([H], dtype=dtype, device="cuda") return query, key, value, sinks -def main(batch: int = 1, - heads: int = 32, - seq_q: int = 256, - seq_kv: int = 256, - dim: int = 128, - window_size: int | None = None, - dtype: str = "float16", - tune: bool = False): - torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] +def main( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: T.dtype = T.float16, + tune: bool = False, +): + dtype = T.dtype(dtype) + torch_dtype = dtype.as_torch() if window_size is not None: - print('Using sliding window attention.') + print("Using sliding window attention.") assert window_size <= seq_q - flops_per_matmul = 2.0 * batch * heads * min( - window_size, seq_kv // 2) * seq_q * dim # just a rough estimation + flops_per_matmul = 2.0 * batch * heads * min(window_size, seq_kv // 2) * seq_q * dim # just a rough estimation else: - print('Using full attention.') + print("Using full attention.") flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim * 0.5 total_flops = 2 * flops_per_matmul @@ -300,15 +242,14 @@ def main(batch: int = 1, block_N=block_N, num_stages=num_stages, threads=threads, - dtype=dtype) + dtype=dtype, + ) Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) torch.testing.assert_close( - kernel(Q, K, V, sinks), - ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), - rtol=1e-2, - atol=1e-2) + kernel(Q, K, V, sinks), ref_program(Q, K, V, sinks, window_size, dtype=torch_dtype), rtol=1e-2, atol=1e-2 + ) print("All checks passed.✅") latency = do_bench(lambda: kernel(Q, K, V, sinks), warmup=500) @@ -316,21 +257,38 @@ def main(batch: int = 1, print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + window_size: Optional[int] = None, + dtype: str = "float16", + tune: bool = False, +): + torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16}[dtype] + block_M = 128 + block_N = 128 + num_stages = 2 + threads = 256 + kernel = flashattn( + batch, heads, seq_q, seq_kv, dim, window_size, block_M=block_M, block_N=block_N, num_stages=num_stages, threads=threads, dtype=dtype + ) + Q, K, V, sinks = gen_inputs(batch, heads, seq_q, seq_kv, dim, dtype=torch_dtype) + latency = do_bench(lambda: kernel(Q, K, V, sinks), backend="cupti") + return 
latency + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='sequence length of query') - parser.add_argument('--seq_kv', type=int, default=4096, help='sequence length of key/value') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument( - '--window_size', - type=int, - default=None, - help='window size (default: None, which means full attention)') - parser.add_argument( - '--dtype', type=str, default="float16", help="dtype, can be float16 or bfloat16") - parser.add_argument('--tune', action='store_true', help='tune') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="sequence length of query") + parser.add_argument("--seq_kv", type=int, default=4096, help="sequence length of key/value") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--dtype", type=str, default=T.float16, help="dtype, can be float16 or bfloat16") + parser.add_argument("--tune", action="store_true", help="tune") args = parser.parse_args() - main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, - args.tune) + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.window_size, args.dtype, args.tune) diff --git a/examples/attention_sink/regression_attention_sink.py b/examples/attention_sink/regression_attention_sink.py new file mode 100644 index 000000000..e2453173c --- /dev/null +++ b/examples/attention_sink/regression_attention_sink.py @@ -0,0 +1,64 @@ +import tilelang.testing +import example_mha_sink_fwd_bhsd +import example_mha_sink_fwd_bhsd_wgmma_pipelined +import example_mha_sink_bwd_bhsd +import example_gqa_sink_bwd_bhsd +import example_gqa_sink_fwd_bhsd_wgmma_pipelined + + +def regression_example_mha_sink_fwd_bhsd(): + tilelang.testing.process_func(example_mha_sink_fwd_bhsd.run_regression_perf) + + +def regression_example_mha_sink_fwd_bhsd_sliding_window(): + tilelang.testing.process_func( + example_mha_sink_fwd_bhsd.run_regression_perf, "regression_example_mha_sink_fwd_bhsd_sliding_window", window_size=128 + ) + + +def regression_example_mha_sink_fwd_bhsd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf) + + +def regression_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): + tilelang.testing.process_func( + example_mha_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf, + "regression_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window", + window_size=128, + ) + + +def regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined(): + tilelang.testing.process_func(example_gqa_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf) + + +def regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): + tilelang.testing.process_func( + example_gqa_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf, + "regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window", + window_size=128, + ) + + +def regression_example_mha_sink_bwd_bhsd(): + tilelang.testing.process_func(example_mha_sink_bwd_bhsd.run_regression_perf) + + +def 
regression_example_mha_sink_bwd_bhsd_sliding_window(): + tilelang.testing.process_func( + example_mha_sink_bwd_bhsd.run_regression_perf, "regression_example_mha_sink_bwd_bhsd_sliding_window", window_size=128 + ) + + +def regression_example_gqa_sink_bwd_bhsd(): + tilelang.testing.process_func(example_gqa_sink_bwd_bhsd.run_regression_perf) + + +def regression_example_gqa_sink_bwd_bhsd_sliding_window(): + tilelang.testing.process_func( + example_gqa_sink_bwd_bhsd.run_regression_perf, "regression_example_gqa_sink_bwd_bhsd_sliding_window", window_size=128 + ) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/bitnet-1.58b/.gitignore b/examples/bitnet-1.58b/.gitignore index 6ea887496..2bcdfd92b 100644 --- a/examples/bitnet-1.58b/.gitignore +++ b/examples/bitnet-1.58b/.gitignore @@ -1 +1 @@ -models/ \ No newline at end of file +models/ diff --git a/examples/bitnet-1.58b/README.md b/examples/bitnet-1.58b/README.md index 2b587eab4..b9898741b 100644 --- a/examples/bitnet-1.58b/README.md +++ b/examples/bitnet-1.58b/README.md @@ -2,7 +2,6 @@ license: mit --- - This is a Tilelang Implementation for the reproduced 1.58bit model from [1bitLLM/bitnet_b1_58-3B](https://huggingface.co/1bitLLM/bitnet_b1_58-3B). We replaced the original simulated Int8x3bit Quantized Inference Kernel with INT8xINT2 Kernel. We also evaluated the model's correctness and performance through `eval_correctness.py` and `benchmark_inference_latency.py`. ## Make Checkpoints for vLLM @@ -43,7 +42,6 @@ python3 inference_with_bitblas_format.py | bitnet-3b-1.58bits | vllm-tilelang | 379.25 | 117.43 | 752.55 | | bitnet-3b-1.58bits | vllm-tilelang-cuda-graph | 2543.58 | 1621.08 | 2731.79 | - ## BitBLAS Results ### Performance @@ -94,4 +92,4 @@ The differences between the reported numbers and the reproduced results are poss journal={arXiv preprint arXiv:2402.17764}, year={2024} } -``` \ No newline at end of file +``` diff --git a/examples/bitnet-1.58b/benchmark.sh b/examples/bitnet-1.58b/benchmark.sh index 6a2550d45..839443dc6 100755 --- a/examples/bitnet-1.58b/benchmark.sh +++ b/examples/bitnet-1.58b/benchmark.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + python benchmark_generate.py --bs 16 --in_seq_len 32 --out_seq_len 128 | tee b16_i32_o128.log python benchmark_generate.py --bs 1 --in_seq_len 512 --out_seq_len 64 | tee b1_i512_o64.log diff --git a/examples/bitnet-1.58b/benchmark_generate.py b/examples/bitnet-1.58b/benchmark_generate.py index d6f21ed50..d678b91a4 100644 --- a/examples/bitnet-1.58b/benchmark_generate.py +++ b/examples/bitnet-1.58b/benchmark_generate.py @@ -12,8 +12,7 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): # Encode the input prompts as a batch - input_ids = tokenizer( - prompts, return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device) + input_ids = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device) # Generate cos and sin values (commented out as not used in generation) seq_length = input_ids.size(1) @@ -37,9 +36,7 @@ def generate_text_batch(model, tokenizer, prompts, max_length=100): end_time = time.time() # Decode the output ids to text - generated_texts = [ - tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids - ] + generated_texts = [tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output_ids] generation_time = end_time - start_time num_tokens = sum(len(output_id) for output_id in output_ids) @@ -52,8 +49,8 @@ def generate_text_batch(model, 
tokenizer, prompts, max_length=100): def profile(model, input_data): - import numpy as np + model = model.cuda() model.eval() @@ -74,25 +71,29 @@ def get_runtime(num_repeats=1): return np.mean(times) -model_path = '1bitLLM/bitnet_b1_58-3B' +model_path = "1bitLLM/bitnet_b1_58-3B" def main(): parser = argparse.ArgumentParser() - parser.add_argument('--bs', default=16, type=int) - parser.add_argument('--in_seq_len', default=32, type=int) - parser.add_argument('--out_seq_len', default=128, type=int) - parser.add_argument('--bitblas', action='store_true') + parser.add_argument("--bs", default=16, type=int) + parser.add_argument("--in_seq_len", default=32, type=int) + parser.add_argument("--out_seq_len", default=128, type=int) + parser.add_argument("--bitblas", action="store_true") args = parser.parse_args() bs = args.bs in_seq_len = args.in_seq_len out_seq_len = args.out_seq_len is_bitblas = args.bitblas - model = BitnetForCausalLM.from_pretrained( - model_path, - use_flash_attention_2=True, - torch_dtype=torch.float16, - ).cuda().half() + model = ( + BitnetForCausalLM.from_pretrained( + model_path, + use_flash_attention_2=True, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) if is_bitblas: with torch.no_grad(): model.quantize() @@ -109,5 +110,5 @@ def main(): print(generate_text_batch(model, tokenizer, prompts, max_length=max_length)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/benchmark_inference_latency.py b/examples/bitnet-1.58b/benchmark_inference_latency.py index 9ce7a3898..788fc5565 100644 --- a/examples/bitnet-1.58b/benchmark_inference_latency.py +++ b/examples/bitnet-1.58b/benchmark_inference_latency.py @@ -6,13 +6,14 @@ torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) def profile(model, input_data): import time import numpy as np + model = model.cuda() model.eval() @@ -35,8 +36,8 @@ def get_runtime(num_repeats=1): def main(): model = BitnetForCausalLM.from_pretrained( - '1bitLLM/bitnet_b1_58-3B', - device_map='auto', + "1bitLLM/bitnet_b1_58-3B", + device_map="auto", low_cpu_mem_usage=True, use_flash_attention_2=True, torch_dtype=torch.float16, @@ -52,5 +53,5 @@ def main(): print(f"Batch size: {batch_size}, Seq len: {seq_len}, Latency: {latency}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/configuration_bitnet.py b/examples/bitnet-1.58b/configuration_bitnet.py index 5f4937b87..63c499db3 100644 --- a/examples/bitnet-1.58b/configuration_bitnet.py +++ b/examples/bitnet-1.58b/configuration_bitnet.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
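# Illustration (standalone sketch, not part of this patch): the `profile`/`get_runtime`
# helpers touched above measure GPU latency with wall-clock time; that is only meaningful
# when the device is synchronized before the timer starts and again after the timed
# launches, roughly like this (assumes torch and a CUDA device):
import time
import torch

def bench_ms(fn, warmup=10, repeats=100):
    for _ in range(warmup):       # warm up allocator, JIT/autotuning, caches
        fn()
    torch.cuda.synchronize()      # drain warmup work before starting the clock
    tic = time.time()
    for _ in range(repeats):
        fn()
    torch.cuda.synchronize()      # wait for all timed kernels to finish
    return (time.time() - tic) * 1e3 / repeats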
-""" LLaMA model configuration""" +"""LLaMA model configuration""" from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -180,16 +180,10 @@ def _rope_scaling_validation(self): return if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " - f"got {self.rope_scaling}") + raise ValueError(f"`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, got {self.rope_scaling}") rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_factor = self.rope_scaling.get("factor", None) if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, - float) or rope_scaling_factor <= 1.0: - raise ValueError( - f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + raise ValueError(f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/examples/bitnet-1.58b/eval_correctness.py b/examples/bitnet-1.58b/eval_correctness.py index ac1e34072..11d47004b 100644 --- a/examples/bitnet-1.58b/eval_correctness.py +++ b/examples/bitnet-1.58b/eval_correctness.py @@ -47,8 +47,8 @@ def generate_text(model, tokenizer, prompt, max_length=100): def profile(model, input_data): - import numpy as np + model = model.cuda() model.eval() @@ -69,18 +69,22 @@ def get_runtime(num_repeats=1): return np.mean(times) -model_path = '1bitLLM/bitnet_b1_58-3B' +model_path = "1bitLLM/bitnet_b1_58-3B" def main(): - model = BitnetForCausalLM.from_pretrained( - model_path, - use_flash_attention_2=False, - torch_dtype=torch.float16, - ).cuda().half() + model = ( + BitnetForCausalLM.from_pretrained( + model_path, + use_flash_attention_2=False, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_path, use_fast=False) - input_id = tokenizer("Hello")['input_ids'] + input_id = tokenizer("Hello")["input_ids"] input_id = torch.tensor(input_id).unsqueeze(0).cuda() print("original model generated text:") @@ -91,5 +95,5 @@ def main(): print(generate_text(model, tokenizer, "Hello", max_length=100)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/eval_gpu_memory.py b/examples/bitnet-1.58b/eval_gpu_memory.py index 597cbbfcd..00c914cb3 100644 --- a/examples/bitnet-1.58b/eval_gpu_memory.py +++ b/examples/bitnet-1.58b/eval_gpu_memory.py @@ -6,13 +6,14 @@ torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) def profile(model, input_data): import time import numpy as np + model = model.cuda() model.eval() @@ -35,17 +36,17 @@ def get_runtime(num_repeats=1): def main(): model = BitnetForCausalLM.from_pretrained( - '1bitLLM/bitnet_b1_58-3B', - device_map='auto', + "1bitLLM/bitnet_b1_58-3B", + device_map="auto", low_cpu_mem_usage=True, use_flash_attention_2=True, torch_dtype=torch.float16, ).half() - 
print(f"gpu memory: {torch.cuda.memory_allocated() / 1024 ** 3} GB") + print(f"gpu memory: {torch.cuda.memory_allocated() / 1024**3} GB") with torch.no_grad(): model._post_process_weights() - print(f"gpu memory BitBLAS: {torch.cuda.memory_allocated() / 1024 ** 3} GB") + print(f"gpu memory BitBLAS: {torch.cuda.memory_allocated() / 1024**3} GB") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/eval_ppl.py b/examples/bitnet-1.58b/eval_ppl.py index 61c8488e4..97db2d0f5 100644 --- a/examples/bitnet-1.58b/eval_ppl.py +++ b/examples/bitnet-1.58b/eval_ppl.py @@ -15,9 +15,9 @@ torch.set_grad_enabled(False) parser = argparse.ArgumentParser() -parser.add_argument('--seed', default=0, type=int) -parser.add_argument('--hf_path', default='1bitLLM/bitnet_b1_58-3B', type=str) -parser.add_argument('--seqlen', default=2048, type=int) +parser.add_argument("--seed", default=0, type=int) +parser.add_argument("--hf_path", default="1bitLLM/bitnet_b1_58-3B", type=str) +parser.add_argument("--seqlen", default=2048, type=int) def calulate_loss(model, input, loss_fct): @@ -29,12 +29,16 @@ def calulate_loss(model, input, loss_fct): def main(args): - datasets = ['c4', 'wikitext2'] - model = BitnetForCausalLM.from_pretrained( - args.hf_path, - use_flash_attention_2=True, - torch_dtype=torch.float16, - ).cuda().half() + datasets = ["c4", "wikitext2"] + model = ( + BitnetForCausalLM.from_pretrained( + args.hf_path, + use_flash_attention_2=True, + torch_dtype=torch.float16, + ) + .cuda() + .half() + ) with torch.no_grad(): model._post_process_weights() tokenizer = BitnetTokenizer.from_pretrained(args.hf_path, use_fast=False) @@ -48,9 +52,9 @@ def main(args): for ii in progress: input = torch.Tensor(testdata[ii]).long().cuda().view(1, -1) loss = calulate_loss(model, input, loss_fct) - count += (input.size(-1) - 1) + count += input.size(-1) - 1 acc_loss += loss.item() - progress.set_description(f"avg_loss = {acc_loss/ count / math.log(2)}") + progress.set_description(f"avg_loss = {acc_loss / count / math.log(2)}") avg_loss = acc_loss / count / math.log(2) ppl.append(2**avg_loss) @@ -60,7 +64,7 @@ def main(args): print("Avg PPL:", sum(ppl) / len(ppl)) -if __name__ == '__main__': +if __name__ == "__main__": torch.set_grad_enabled(False) args = parser.parse_args() random.seed(args.seed) diff --git a/examples/bitnet-1.58b/eval_utils.py b/examples/bitnet-1.58b/eval_utils.py index 46241eedf..72480c392 100644 --- a/examples/bitnet-1.58b/eval_utils.py +++ b/examples/bitnet-1.58b/eval_utils.py @@ -15,21 +15,17 @@ def set_seed(seed): def get_test_dataset(dataset_name, tokenizer, seqlen=2048): if dataset_name == "wikitext2": - testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') - testdata = "".join(testdata['text']).split('\n') + testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + testdata = "".join(testdata["text"]).split("\n") elif dataset_name == "c4": - testdata = load_dataset( - 'allenai/c4', - data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, - split='validation')['text'] + testdata = load_dataset("allenai/c4", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation")[ + "text" + ] else: raise NotImplementedError testdata = [item for item in testdata if item != ""] - tokenized_text = [ - tokenizer(item, add_special_tokens=False)['input_ids'] + [tokenizer.eos_token_id] - for item in testdata - ] + tokenized_text = [tokenizer(item, add_special_tokens=False)["input_ids"] + 
[tokenizer.eos_token_id] for item in testdata] data, doc = [], [tokenizer.bos_token_id] for sen in tokenized_text: @@ -45,7 +41,6 @@ def get_test_dataset(dataset_name, tokenizer, seqlen=2048): class LMEvalAdaptor(BaseLM): - def __init__(self, model_name, model, tokenizer, batch_size=1, max_length=-1): super().__init__() @@ -137,5 +132,4 @@ def _model_call(self, inps): return out def _model_generate(self, context, max_length, eos_token_id): - return self.model.generate( - context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False) + return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py index e5af16cc4..7b8b7b95c 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py @@ -76,13 +76,13 @@ def bitnet_158_int8xint2_decode( reduce_thread=32, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" storage_nbit = 8 num_bits = 2 @@ -94,7 +94,7 @@ def bitnet_158_int8xint2_decode( MAX_TRANSACTION_SIZE_IN_BITS = 128 micro_size_k = MAX_TRANSACTION_SIZE_IN_BITS // DataType(in_dtype).bits micro_size_k_compressed = micro_size_k // num_elems_per_byte - storage_dtype = "int8" + storage_dtype = T.int8 block_K = reduce_thread * micro_size_k use_dp4a = True @@ -102,17 +102,17 @@ def bitnet_158_int8xint2_decode( @T.prim_func def kernel( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, storage_dtype), - C: T.Buffer(C_shape, out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, storage_dtype), + C: T.Buffer(C_shape, out_dtype), ): with T.Kernel( - T.ceildiv(N, n_partition), - M, - threads=(reduce_thread, n_partition), + T.ceildiv(N, n_partition), + M, + threads=(reduce_thread, n_partition), ) as ( - bx, - by, + bx, + by, ): A_local = T.alloc_local((micro_size_k,), in_dtype) B_quant_local = T.alloc_local([micro_size_k_compressed], storage_dtype) @@ -133,8 +133,7 @@ def kernel( for v in T.vectorized(micro_size_k_compressed): B_quant_local[v] = B[ bx * n_partition + ni, - ko * (reduce_thread * micro_size_k_compressed) + - kr * micro_size_k_compressed + v, + ko * (reduce_thread * micro_size_k_compressed) + kr * micro_size_k_compressed + v, ] T.call_extern( @@ -156,9 +155,9 @@ def kernel( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -168,7 +167,8 @@ def kernel( reduced_accum_res[0], kr, dtype="handle", - )) + ) + ) if kr == 0: C[by, bx * n_partition + ni] = reduced_accum_res[0] @@ -194,12 +194,12 @@ def general_compress(lowprecision_weight, source_bits=4, storage_dtype=np.int8): # interleave weight numpy implementation -def interleave_weight(qweight, nbits=4, target_dtype="float16"): - assert target_dtype in ["float16", "int8"] +def interleave_weight(qweight, nbits=4, target_dtype=T.float16): + assert 
target_dtype in [T.float16, T.int8] # reinterpret the data type of qweight to int32 qweight = qweight.view(np.int32) new_qweight = np.zeros_like(qweight) - bits_stride = 8 if target_dtype == "int8" else 16 + bits_stride = 8 if target_dtype == T.int8 else 16 mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f num_groups = 32 // bits_stride elems_per_group = bits_stride // nbits @@ -209,7 +209,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): shift = (offset % num_groups) * bits_stride + (offset // num_groups) * nbits new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift - if nbits == 1 and target_dtype == "int8": + if nbits == 1 and target_dtype == T.int8: # special handling for 1b interleave n16_weight = new_qweight & np.int32(0xF0F00F0F) n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 @@ -217,12 +217,12 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 return n16_weight.view(np.int8) - elif nbits == 2 and target_dtype == "float16": + elif nbits == 2 and target_dtype == T.float16: n8_weight = new_qweight & np.int32(0xFF0000FF) n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 return n8_weight.view(np.int8) - elif nbits == 1 and target_dtype == "float16": + elif nbits == 1 and target_dtype == T.float16: n8_weight = new_qweight & 0xF000000F n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 @@ -234,13 +234,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): return new_qweight.view(np.int8) -def assert_bitnet_158_int8xint2_decode_correctness(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - fast_decoding=True): +def assert_bitnet_158_int8xint2_decode_correctness(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding=True): program = bitnet_158_int8xint2_decode(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding) print(program) kernel = tilelang.compile(program) @@ -265,4 +259,4 @@ def assert_bitnet_158_int8xint2_decode_correctness(M, if __name__ == "__main__": - assert_bitnet_158_int8xint2_decode_correctness(1, 256, 256, "int8", "int32", "int32") + assert_bitnet_158_int8xint2_decode_correctness(1, 256, 256, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py index d8b1f6228..f4a60098a 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py @@ -8,11 +8,13 @@ from tilelang import tvm as tvm from tvm import DataType from tilelang.intrinsics.mma_layout import ( - make_mma_swizzle_layout as make_swizzle_layout,) + make_mma_swizzle_layout as make_swizzle_layout, +) import numpy as np from tilelang.intrinsics.mma_macro_generator import ( - INT4TensorCoreIntrinEmitter,) + INT4TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func torch.manual_seed(42) @@ -86,9 +88,9 @@ def bitnet_158_int8xint2_prefill( Create a TVM GPU prim_func implementing a block-tiled matrix multiply that multiplies dense A by compressed/interleaved low‑precision B (2-bit packed into int8 storage), decoding B to int8 on-chip and accumulating into C. 
The returned prim_func expects: - - A: shape (M, K) with dtype `in_dtype` ("float16" or "int8"). + - A: shape (M, K) with dtype `in_dtype` (T.float16 or T.int8). - B: compressed storage with shape (N, K/4) and int8 storage layout (packing 4 2-bit elements per byte). - - C: output buffer shape (M, N) with dtype `out_dtype` ("float16", "float32", or "int32"). + - C: output buffer shape (M, N) with dtype `out_dtype` (T.float16, T.float32, or T.int32). Details: - Builds a tiled, pipelined kernel using shared memory and warp-level MMA intrinsics (INT4TensorCoreIntrinEmitter). B is loaded from compressed storage, decoded to int8 in threads (via decode_i2u_to_i8s / decode_i2s_to_i8s), and dequantized into a shared buffer used by the MMA emitter. @@ -96,15 +98,15 @@ def bitnet_158_int8xint2_prefill( - block_row_warps, block_col_warps: number of warps per block in row/col. - warp_row_tiles, warp_col_tiles: tiles per warp. - chunk: K-sized chunk per block (block_K). - - micro sizes are fixed (16x16x16, except micro_k=32 when accum_dtype == "int32"). + - micro sizes are fixed (16x16x16, except micro_k=32 when accum_dtype == T.int32). - Uses 2-stage pipelining by default to overlap loads and compute and applies a swizzle layout to improve L2 behavior. - Assertions: raises AssertionError if in_dtype or out_dtype are not among supported values. Parameters: M, N, K (int): Global matrix dimensions. - in_dtype (str): Input and decoded B element dtype; "float16" or "int8". - out_dtype (str): Output C dtype; one of "float16", "float32", "int32". - accum_dtype (str): Accumulator dtype used by MMA (e.g., "int32"). + in_dtype (str): Input and decoded B element dtype; T.float16 or T.int8. + out_dtype (str): Output C dtype; one of T.float16, T.float32, T.int32. + accum_dtype (str): Accumulator dtype used by MMA (e.g., T.int32). fast_decoding (bool): If True, enable the fast decoding path (affects which device decode is used). block_row_warps (int): Warps in block row dimension. block_col_warps (int): Warps in block column dimension. @@ -116,18 +118,18 @@ def bitnet_158_int8xint2_prefill( T.prim_func: A TVM prim_func implementing the described GPU kernel suitable for compilation and execution. """ assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if accum_dtype == "int32": + if accum_dtype == T.int32: micro_size_k = 32 num_elems_per_byte = 4 @@ -136,7 +138,7 @@ def bitnet_158_int8xint2_prefill( local_size_compressed = local_size // num_elems_per_byte shared_scope = "shared.dyn" - storage_dtype = "int8" + storage_dtype = T.int8 # Pipeline Stage stage = 2 @@ -181,38 +183,36 @@ def bitnet_158_int8xint2_prefill( @T.prim_func def main( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, storage_dtype), - C: T.Buffer((M, N), out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, storage_dtype), + C: T.Buffer((M, N), out_dtype), ): """ - GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C. + GPU kernel entry that performs a blocked, pipelined matrix multiplication A @ B.T writing into C. - This kernel: - - Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory. 
- - Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine. - - Uses Warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages. - - Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing. + This kernel: + - Loads tiles of A and a compressed/interleaved representation of B from global memory into shared memory. + - Decodes B's packed low-precision format (storage_dtype, e.g., 2-bit packed) into element values of `in_dtype` in shared memory via an external decode routine. + - Uses Warp/MMA tiled fragments and an INT4/INT2-capable MMA emitter to compute accumulation across K in a pipelined fashion with configurable stages. + - Writes accumulated tile results from shared memory back to global C with the expected block/micro-tile indexing. - Parameters: - A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations. - B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel. - C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C). + Parameters: + A: Input matrix buffer of shape A_shape and element type `in_dtype`. Represents the MxK activations. + B: Compressed/interleaved weight buffer of shape B_shape and storage type `storage_dtype`. Must contain B in the packed low-precision layout expected by the decode routine used by this kernel. + C: Output buffer of shape (M, N) and type `out_dtype`; receives the resulting matrix (accumulated values are produced in `accum_dtype` and stored into C). - Side effects: - Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation. + Side effects: + Writes results into C. Calls external device decode functions to expand B from its packed representation into shared memory before computation. 
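Editor's note: the decode step described in this docstring expands four 2-bit weights from each stored int8 byte before the MMA computation. As a rough host-side illustration of that storage layout (assuming the low-bits-first packing that `general_compress` produces for 2-bit data; this is an editor's sketch, not the kernel's device decode path), packing and unpacking can be written in NumPy as:

```python
import numpy as np


def pack_int2(values):
    """Pack 2-bit values (0..3) into int8 storage, four per byte, low bits first."""
    flat = values.reshape(-1).astype(np.uint8)
    assert flat.size % 4 == 0
    packed = np.zeros(flat.size // 4, dtype=np.uint8)
    for k in range(4):
        packed |= ((flat[k::4] & 0x3) << (2 * k)).astype(np.uint8)
    return packed.view(np.int8)


def unpack_int2(packed):
    """Inverse of pack_int2: recover the 2-bit values as int8 in [0, 3]."""
    data = packed.view(np.uint8)
    out = np.empty(data.size * 4, dtype=np.int8)
    for k in range(4):
        out[k::4] = (data >> (2 * k)) & 0x3
    return out


w = np.random.randint(0, 4, size=(8,), dtype=np.uint8)
assert np.array_equal(unpack_int2(pack_int2(w)).astype(np.uint8), w)
```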
""" with T.Kernel( - T.ceildiv(N, block_N), - T.ceildiv(M, block_M), - threads=threads, - prelude=decode_i2s_to_i8s, + T.ceildiv(N, block_N), + T.ceildiv(M, block_M), + threads=threads, + prelude=decode_i2s_to_i8s, ) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, storage_dtype, scope=shared_scope) - B_dequantize_shared = T.alloc_shared( - B_dequantize_shared_shape, in_dtype, scope=shared_scope) + B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) A_frag = T.alloc_local((warp_rows * fragement_size_a), in_dtype) B_frag = T.alloc_local((warp_cols * fragement_size_b), in_dtype) @@ -221,12 +221,14 @@ def main( B_local = T.alloc_local([local_size_compressed], storage_dtype) B_dequantize_local = T.alloc_local([local_size], in_dtype) - thread_bindings = T.thread_binding(0, threads, "threadIdx.x") + thread_bindings = T.get_thread_binding(0) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_dequantize_shared: make_swizzle_layout(B_dequantize_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_dequantize_shared: make_swizzle_layout(B_dequantize_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -234,7 +236,6 @@ def main( T.clear(C_frag) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -243,12 +244,9 @@ def main( for j, k in T.Parallel(block_N, block_K // num_elems_per_byte): B_shared[j, k] = B[bx * block_N + j, ko * (block_K // num_elems_per_byte) + k] - for i in T.serial(block_N * block_K // num_elems_per_byte // - (threads * local_size_compressed)): + for i in T.serial(block_N * block_K // num_elems_per_byte // (threads * local_size_compressed)): for v in T.vectorized(0, local_size_compressed): - index = ( - i * threads * local_size_compressed + - thread_bindings * local_size_compressed + v) + index = i * threads * local_size_compressed + thread_bindings * local_size_compressed + v vi, vj = T.index_to_coordinates(index, B_shared_shape) B_local[v] = B_shared[vi, vj] @@ -260,12 +258,11 @@ def main( ) for v in T.vectorized(0, local_size): - index = (i * threads * local_size + thread_bindings * local_size + v) + index = i * threads * local_size + thread_bindings * local_size + v vi, vj = T.index_to_coordinates(index, B_dequantize_shared_shape) B_dequantize_shared[vi, vj] = B_dequantize_local[v] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_frag, @@ -320,12 +317,12 @@ def general_compress(lowprecision_weight, source_bits=4, storage_dtype=np.int8): # interleave weight numpy implementation -def interleave_weight(qweight, nbits=4, target_dtype="float16"): - assert target_dtype in ["float16", "int8"] +def interleave_weight(qweight, nbits=4, target_dtype=T.float16): + assert target_dtype in [T.float16, T.int8] # reinterpret the data type of qweight to int32 qweight = qweight.view(np.int32) new_qweight = np.zeros_like(qweight) - bits_stride = 8 if target_dtype == "int8" else 16 + bits_stride = 8 if target_dtype == T.int8 else 16 mask = (1 << nbits) - 1 # for 4bit the val is 0x0000000f num_groups = 32 // bits_stride elems_per_group = bits_stride // nbits @@ -335,7 +332,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): shift = (offset % 
num_groups) * bits_stride + (offset // num_groups) * nbits new_qweight |= ((qweight >> (nbits * offset)) & mask) << shift - if nbits == 1 and target_dtype == "int8": + if nbits == 1 and target_dtype == T.int8: # special handling for 1b interleave n16_weight = new_qweight & np.int32(0xF0F00F0F) n16_weight |= ((new_qweight & np.int32(0x000000F0)) >> 4) << 16 @@ -343,12 +340,12 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): n16_weight |= ((new_qweight & np.int32(0x000F0000)) >> 16) << 4 n16_weight |= ((new_qweight & np.int32(0x0F000000)) >> 24) << 12 return n16_weight.view(np.int8) - elif nbits == 2 and target_dtype == "float16": + elif nbits == 2 and target_dtype == T.float16: n8_weight = new_qweight & np.int32(0xFF0000FF) n8_weight |= ((new_qweight & np.int32(0x0000FF00)) >> 8) << 16 n8_weight |= ((new_qweight & np.int32(0x00FF0000)) >> 16) << 8 return n8_weight.view(np.int8) - elif nbits == 1 and target_dtype == "float16": + elif nbits == 1 and target_dtype == T.float16: n8_weight = new_qweight & 0xF000000F n8_weight |= ((new_qweight & 0x000000F0) >> 4) << 8 n8_weight |= ((new_qweight & 0x00000F00) >> 8) << 16 @@ -360,13 +357,7 @@ def interleave_weight(qweight, nbits=4, target_dtype="float16"): return new_qweight.view(np.int8) -def assert_bitnet_158_int8xint2_prefill_correctness(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - fast_decoding=True): +def assert_bitnet_158_int8xint2_prefill_correctness(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding=True): program = bitnet_158_int8xint2_prefill(M, N, K, in_dtype, out_dtype, accum_dtype, fast_decoding) print(program) kernel = tilelang.compile(program) @@ -391,4 +382,4 @@ def assert_bitnet_158_int8xint2_prefill_correctness(M, if __name__ == "__main__": - assert_bitnet_158_int8xint2_prefill_correctness(256, 256, 256, "int8", "int32", "int32") + assert_bitnet_158_int8xint2_prefill_correctness(256, 256, 256, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py b/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py index 986463598..e3d35df4b 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tl_int8xint8.py @@ -6,7 +6,8 @@ import tvm.tl.language as T from bitblas.tl.utils import get_swizzle_layout from bitblas.tl.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from bitblas.base import simplify_prim_func torch.manual_seed(0) @@ -37,18 +38,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -56,7 +57,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 64 warp_col_tiles = 64 - chunk = 32 if in_dtype == "float16" else 64 + chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" # Pipeline Stage @@ -101,12 +102,11 @@ def tl_matmul( @T.prim_func def main( - A: T.Buffer(A_shape, in_dtype), - B: T.Buffer(B_shape, in_dtype), - C: T.Buffer((M, N), out_dtype), + A: T.Buffer(A_shape, in_dtype), + B: T.Buffer(B_shape, in_dtype), + C: T.Buffer((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared 
= T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -116,10 +116,12 @@ def main( thread_bindings = T.thread_binding(0, threads, "threadIdx.x") - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -127,7 +129,6 @@ def main( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -137,7 +138,6 @@ def main( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -183,7 +183,7 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): # src_code is the generated cuda source assert src_code is not None print(src_code) - if in_dtype == "int8": + if in_dtype == T.int8: A = torch.randint(-7, 7, (M, K), device="cuda", dtype=torch.int8) B = torch.randint(-7, 7, (N, K), device="cuda", dtype=torch.int8) else: @@ -209,12 +209,12 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def test_assert_tl_matmul(): - assert_tl_matmul_correctness(128, 128, 128, "float16", "float16", "float16") - assert_tl_matmul_correctness(128, 256, 256, "float16", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + assert_tl_matmul_correctness(128, 256, 256, T.float16, T.float32, T.float32) if __name__ == "__main__": # bitblas.testing.main() - # assert_tl_matmul_correctness(128, 128, 128, "float16", "float16", "float16") - # assert_tl_matmul_correctness(128, 128, 128, "int8", "int32", "int32") - assert_tl_matmul_correctness(16384, 16384, 16384, "int8", "int32", "int32") + # assert_tl_matmul_correctness(128, 128, 128, T.float16, T.float16, T.float16) + # assert_tl_matmul_correctness(128, 128, 128, T.int8, T.int32, T.int32) + assert_tl_matmul_correctness(16384, 16384, 16384, T.int8, T.int32, T.int32) diff --git a/examples/bitnet-1.58b/load_from_quantized.py b/examples/bitnet-1.58b/load_from_quantized.py index 26a32f974..8c775aa4c 100644 --- a/examples/bitnet-1.58b/load_from_quantized.py +++ b/examples/bitnet-1.58b/load_from_quantized.py @@ -49,7 +49,13 @@ def generate_text(model, tokenizer, prompt, max_length=100): def main(): # load quantized model - qmodel = BitnetForCausalLM.from_quantized(saved_model_path,).cuda().half() + qmodel = ( + BitnetForCausalLM.from_quantized( + saved_model_path, + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) # print("original model generated text:") # print(generate_text(model, tokenizer, "Hi, ", max_length=100)) diff --git a/examples/bitnet-1.58b/maint/README.md b/examples/bitnet-1.58b/maint/README.md index 63cc3e275..6bccdf93a 100644 --- a/examples/bitnet-1.58b/maint/README.md +++ b/examples/bitnet-1.58b/maint/README.md @@ -2,7 +2,6 @@ license: mit --- - This is a BitBLAS Implementation for the reproduced 1.58bit model from [1bitLLM/bitnet_b1_58-3B](https://huggingface.co/1bitLLM/bitnet_b1_58-3B). We replaced the original simulated Int8x3bit Quantized Inference Kernel with BitBLAS INT8xINT2 Kernel. 
We also evaluated the model's correctness and performance through `eval_correctness.py` and `benchmark_inference_latency.py`. ## Latest News @@ -88,4 +87,4 @@ The differences between the reported numbers and the reproduced results are poss journal={arXiv preprint arXiv:2402.17764}, year={2024} } -``` \ No newline at end of file +``` diff --git a/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py b/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py index 1e29a553a..2604ef387 100644 --- a/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py +++ b/examples/bitnet-1.58b/maint/create_bitblas_ckpt.py @@ -25,9 +25,9 @@ args = parser.parse_args() model_name_or_path = args.model_name_or_path -saved_model_path = os.path.join( - dirpath, "models", - f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +saved_model_path = ( + os.path.join(dirpath, "models", f"{model_name_or_path}_bitblas") if args.saved_model_path is None else args.saved_model_path +) def generate_text(model, tokenizer, prompt, max_length=100): @@ -67,7 +67,10 @@ def main(): model_name_or_path, use_flash_attention_2=False, torch_dtype=torch.float16, - ).cuda().half()) + ) + .cuda() + .half() + ) tokenizer = BitnetTokenizer.from_pretrained(model_name_or_path, use_fast=False) # print("original model generated text:") @@ -112,10 +115,16 @@ def main(): file_path = cached_file(model_name_or_path, file) os.system(f"cp {file_path} {saved_model_path}") # load quantized model - qmodel = BitnetForCausalLM.from_quantized(saved_model_path,).cuda().half() + qmodel = ( + BitnetForCausalLM.from_quantized( + saved_model_path, + ) + .cuda() + .half() + ) print("quantized model generated text:") print(generate_text(qmodel, tokenizer, "Hi, ", max_length=100)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh index 741c3a124..b0430588a 100755 --- a/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh +++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_bitblas_format.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # retrieve the native model input and saved model directory MODEL_DIR=$1 SAVED_MODEL_DIR=$2 diff --git a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh index a2df0eb8c..66356d3d8 100755 --- a/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh +++ b/examples/bitnet-1.58b/maint/generate_bitnet_model_native_format.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # require git lfs if ! 
command -v git-lfs &> /dev/null; then echo "Please install git-lfs first by running 'sudo apt install git-lfs'" diff --git a/examples/bitnet-1.58b/maint/quantize_config.json b/examples/bitnet-1.58b/maint/quantize_config.json index e2b24123a..80fbf02f0 100644 --- a/examples/bitnet-1.58b/maint/quantize_config.json +++ b/examples/bitnet-1.58b/maint/quantize_config.json @@ -7,4 +7,4 @@ "model_name_or_path": "1bitLLM/bitnet_b1_58-3B", "quant_method": "bitnet", "checkpoint_format": "bitnet" -} \ No newline at end of file +} diff --git a/examples/bitnet-1.58b/maint/upload_models.sh b/examples/bitnet-1.58b/maint/upload_models.sh index b764b0da6..7c6d76e32 100755 --- a/examples/bitnet-1.58b/maint/upload_models.sh +++ b/examples/bitnet-1.58b/maint/upload_models.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + MODEL_DIR=$1 REMOTE_DIR=$2 diff --git a/examples/bitnet-1.58b/modeling_bitnet.py b/examples/bitnet-1.58b/modeling_bitnet.py index 6e3c42b6f..1830995ee 100644 --- a/examples/bitnet-1.58b/modeling_bitnet.py +++ b/examples/bitnet-1.58b/modeling_bitnet.py @@ -64,8 +64,7 @@ def find_layers(module, layers=None, name=""): return {name: module} res = {} for name1, child in module.named_children(): - res.update( - find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) + res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1)) return res @@ -87,7 +86,6 @@ def _get_unpad_data(attention_mask): class BitnetRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): """ BitnetRMSNorm is equivalent to T5LayerNorm @@ -108,34 +106,23 @@ def forward(self, hidden_states): class BitnetRotaryEmbedding(nn.Module): - - def __init__(self, - dim, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() self.scaling_factor = scaling_factor self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / ( - self.base - **(torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) self.register_buffer("inv_freq", inv_freq) # For BC we register cos and sin cached self.max_seq_len_cached = max_position_embeddings - t = torch.arange( - self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) t = t / self.scaling_factor freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer( - "_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) - self.register_buffer( - "_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_cos_cached", emb.cos().to(torch.get_default_dtype()), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(torch.get_default_dtype()), persistent=False) @property def sin_cached(self): @@ -156,14 +143,12 @@ def cos_cached(self): @torch.no_grad() def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, - None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = self.inv_freq[None, :, 
None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, - str) and device_type != "mps" else "cpu" + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) @@ -174,8 +159,8 @@ def forward(self, x, position_ids): def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) @@ -207,7 +192,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): class BitnetMLP(nn.Module): - def __init__(self, config): super().__init__() self.config = config @@ -245,7 +229,6 @@ def forward(self, x): class BitnetMLPFuseGateUp(nn.Module): - def __init__(self, config): super().__init__() self.config = config @@ -272,8 +255,7 @@ def __init__(self, config): def from_bit_mlp(cls, bit_mlp: BitnetMLP): module = cls(bit_mlp.config) # assign the weights - module.gate_up_proj.weight = nn.Parameter( - torch.cat([bit_mlp.gate_proj.weight, bit_mlp.up_proj.weight], dim=0)) + module.gate_up_proj.weight = nn.Parameter(torch.cat([bit_mlp.gate_proj.weight, bit_mlp.up_proj.weight], dim=0)) module.down_proj = bit_mlp.down_proj module.ffn_layernorm = bit_mlp.ffn_layernorm return module @@ -295,8 +277,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) @@ -311,7 +292,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class.") + "when creating this class." + ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -325,8 +307,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." 
+ ) self.q_proj = BitLinear( self.hidden_size, @@ -387,10 +369,8 @@ def forward( value_states = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) @@ -399,30 +379,24 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") + raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() @@ -448,7 +422,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class.") + "when creating this class." + ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -462,8 +437,8 @@ def __init__(self, config: BitnetConfig, layer_idx: Optional[int] = None): if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`: {self.num_heads})." 
+ ) self.qkv_proj = BitLinear( self.hidden_size, @@ -497,17 +472,12 @@ def from_bit_attention(cls, bit_attention: BitnetAttention): module = cls(bit_attention.config, bit_attention.layer_idx) # assign the weights module.qkv_proj.weight = nn.Parameter( - torch.cat([ - bit_attention.q_proj.weight, bit_attention.k_proj.weight, - bit_attention.v_proj.weight - ], - dim=0)) + torch.cat([bit_attention.q_proj.weight, bit_attention.k_proj.weight, bit_attention.v_proj.weight], dim=0) + ) if bit_attention.q_proj.bias is not None and bit_attention.k_proj.bias is not None and bit_attention.v_proj.bias is not None: module.qkv_proj.bias = nn.Parameter( - torch.cat([ - bit_attention.q_proj.bias, bit_attention.k_proj.bias, bit_attention.v_proj.bias - ], - dim=0)) + torch.cat([bit_attention.q_proj.bias, bit_attention.k_proj.bias, bit_attention.v_proj.bias], dim=0) + ) module.o_proj = bit_attention.o_proj module.inner_attn_ln = bit_attention.inner_attn_ln if bit_attention.config.rope_scaling is None: @@ -528,16 +498,13 @@ def forward( bsz, q_len, _ = hidden_states.size() qkv_states = self.qkv_proj(hidden_states) query_states, key_states, value_states = torch.split( - qkv_states, [ - self.num_heads * self.head_dim, self.num_key_value_heads * self.head_dim, - self.num_key_value_heads * self.head_dim - ], - dim=-1) + qkv_states, + [self.num_heads * self.head_dim, self.num_key_value_heads * self.head_dim, self.num_key_value_heads * self.head_dim], + dim=-1, + ) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) @@ -546,30 +513,24 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax( - attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout( - attn_weights, p=self.attention_dropout, training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) 
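Editor's note: the eager attention path around this point (in both the plain and fused-QKV variants) computes softmax(Q·K^T / sqrt(d) + mask) in float32, casts back to the query dtype, applies dropout, and then multiplies by V. A compact reference of that computation, using illustrative tensor names rather than the module's own buffers:

```python
import math

import torch
import torch.nn.functional as F


def eager_attention(q, k, v, causal_mask=None, dropout_p=0.0, training=False):
    """q, k, v: (batch, heads, seq, head_dim); causal_mask is additive (0 / large negative)."""
    attn = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(q.size(-1))
    if causal_mask is not None:
        attn = attn + causal_mask[..., : k.shape[-2]]
    attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)  # fp32 softmax
    attn = F.dropout(attn, p=dropout_p, training=training)
    return torch.matmul(attn, v)
```

Upcasting the softmax to float32 before casting back avoids precision and overflow issues with float16 logits, which is why the code above does it explicitly.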
attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") + raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() @@ -622,10 +583,8 @@ def forward( # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) @@ -635,8 +594,7 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, - self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. @@ -665,14 +623,14 @@ def forward( logger.warning_once( f"The input hidden states seems to be silently casted in float32, this might be related to" f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}.") + f" {target_dtype}." + ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate) + attn_output = self._flash_attention_forward(query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.inner_attn_ln(attn_output) @@ -683,14 +641,9 @@ def forward( return attn_output, attn_weights, past_key_value - def _flash_attention_forward(self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None): + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. 
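Editor's note: for the padded case described in this docstring, the flash-attention path first strips padding tokens and builds the variable-length metadata (flat token indices, cumulative sequence lengths, and the longest sequence in the batch) that `_upad_input` feeds to the variable-length flash-attention kernel. A small sketch of that metadata computation against a toy attention mask (the function name here is illustrative):

```python
import torch
import torch.nn.functional as F


def unpad_metadata(attention_mask):
    """Flat indices of real tokens, cumulative sequence lengths, and max length."""
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, int(seqlens.max())


# left-padded batch: row 0 has 2 real tokens, row 1 has 4
mask = torch.tensor([[0, 0, 1, 1],
                     [1, 1, 1, 1]])
indices, cu_seqlens, max_len = unpad_metadata(mask)
print(indices)     # tensor([2, 3, 4, 5, 6, 7])
print(cu_seqlens)  # tensor([0, 2, 6], dtype=torch.int32)
print(max_len)     # 4
```

`pad_input` then scatters the unpadded attention output back to the original (batch, seq_len) layout using the same `indices`.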
@@ -720,7 +673,8 @@ def _flash_attention_forward(self, if attention_mask is not None: batch_size = query_states.shape[0] query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length) + query_states, key_states, value_states, attention_mask, query_length + ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens @@ -740,13 +694,7 @@ def _flash_attention_forward(self, attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal) + attn_output = flash_attn_func(query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal) return attn_output @@ -754,28 +702,24 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k) + query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k elif query_length == 1: max_seqlen_in_batch_q = 1 cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, - device=query_layer.device) # There is a memcpy here, that is very bad. + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. indices_q = cu_seqlens_q[:-1] query_layer = query_layer.squeeze(1) else: # The -q_len: slice assumes left padding. attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( - query_layer, attention_mask) + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) return ( query_layer, @@ -794,13 +738,11 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query class BitnetDecoderLayer(nn.Module): - def __init__(self, config: BitnetConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation]( - config=config, layer_idx=layer_idx) + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) self.mlp = BitnetMLP(config) self.input_layernorm = BitnetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -834,7 +776,8 @@ def forward( if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`", - stacklevel=2) + stacklevel=2, + ) residual = hidden_states @@ -925,8 +868,7 @@ def _setup_cache(self, cache_cls, max_batch_size, max_cache_len: Optional[int] = dtype = self.config._pre_quantization_dtype else: dtype = layer.self_attn.o_proj.weight.dtype - layer.self_attn.past_key_value = cache_cls( - self.config, max_batch_size, max_cache_len, device=device, dtype=dtype) + layer.self_attn.past_key_value = cache_cls(self.config, max_batch_size, max_cache_len, device=device, dtype=dtype) def _reset_cache(self): for layer in self.model.layers: @@ -1025,9 +967,7 @@ def __init__(self, config: BitnetConfig): self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([ - BitnetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers) - ]) + self.layers = nn.ModuleList([BitnetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) self.norm = BitnetRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False @@ -1055,21 +995,15 @@ def forward( cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one") if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." - ) + logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`.") use_cache = False if inputs_embeds is None: @@ -1083,10 +1017,7 @@ def forward( if cache_position is None: if isinstance(past_key_values, StaticCache): raise ValueError("cache_position is a required argument when using StaticCache.") - cache_position = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device) + cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device) if position_ids is None: position_ids = cache_position.unsqueeze(0) @@ -1143,12 +1074,9 @@ def forward( next_cache = None if use_cache: - next_cache = ( - next_decoder_cache.to_legacy_cache() - if isinstance(next_decoder_cache, Cache) else next_decoder_cache) + next_cache = next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None) + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, @@ -1172,14 +1100,9 @@ def _update_causal_mask(self, attention_mask, input_tensor, cache_position): if hasattr(self.layers[0].self_attn, "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1) - - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) + target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) @@ -1188,10 +1111,8 @@ def _update_causal_mask(self, attention_mask, input_tensor, cache_position): causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit if attention_mask.dim() == 2: mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[..., :mask_length].eq( - 0.0) * attention_mask[:, None, None, :].eq(0.0) - causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill( - padding_mask, min_dtype) + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) elif attention_mask.dim() == 4: # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with # cache. In that case, the 4D attention mask attends to the newest tokens only. @@ -1201,8 +1122,7 @@ def _update_causal_mask(self, attention_mask, input_tensor, cache_position): offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[:mask_shape[0], :mask_shape[1], - offset:mask_shape[2] + offset, :mask_shape[3]] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = mask_slice return causal_mask @@ -1279,9 +1199,7 @@ def forward( "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) @@ -1327,13 +1245,9 @@ def forward( attentions=outputs.attentions, ) - def prepare_inputs_for_generation(self, - input_ids, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - cache_position=None, - **kwargs): + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, cache_position=None, **kwargs + ): # With static cache, the `past_key_values` is None # TODO joao: standardize interface for the different Cache classes and remove of this if has_static_cache = False @@ -1344,13 +1258,13 @@ def prepare_inputs_for_generation(self, past_length = 0 if past_key_values is not None: if isinstance(past_key_values, Cache): - past_length = cache_position[ - 0] if cache_position is not None else past_key_values.get_seq_length() + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() max_cache_length = ( torch.tensor(past_key_values.get_max_length(), device=input_ids.device) - if past_key_values.get_max_length() is not None else None) - cache_length = past_length if max_cache_length is None else torch.min( - max_cache_length, past_length) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects else: cache_length = past_length = past_key_values[0][0].shape[2] @@ -1361,7 +1275,7 @@ def prepare_inputs_for_generation(self, # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as # input) if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):] + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: @@ -1369,8 +1283,7 @@ def prepare_inputs_for_generation(self, # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if (max_cache_length is not None and attention_mask is not None and - cache_length + input_ids.shape[1] > max_cache_length): + if max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length: attention_mask = attention_mask[:, -max_cache_length:] position_ids = kwargs.get("position_ids") @@ -1379,7 +1292,7 @@ def prepare_inputs_for_generation(self, position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] + position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: @@ -1392,39 +1305,38 @@ def prepare_inputs_for_generation(self, input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] if cache_position is None: - cache_position = torch.arange( - past_length, past_length + input_length, device=input_ids.device) + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) else: cache_position = cache_position[-input_length:] if has_static_cache: past_key_values = None - model_inputs.update({ - "position_ids": position_ids, - "cache_position": cache_position, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - }) + model_inputs.update( + { + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) return model_inputs @staticmethod def _reorder_cache(past_key_values, beam_idx): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past),) + reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),) return reordered_past @staticmethod def recursive_set(model, name, attr): - ''' - set layers.25.mlp.up_proj to attr - ''' + """ + set layers.25.mlp.up_proj to attr + """ - names = name.split('.') + names = name.split(".") obj = model for n in names[:-1]: obj = getattr(obj, n) @@ -1521,6 +1433,7 @@ def from_quantized( fuse_gateup = quant_config.get("fuse_gateup", True) import accelerate + if checkpoint_format == "bitblas": model = cls(config) for name, module in model.named_modules(): @@ -1567,7 +1480,6 @@ def from_quantized( LLAMA_START_DOCSTRING, ) class BitnetForSequenceClassification(BitnetPreTrainedModel): - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1631,8 +1543,7 @@ def forward( else: if input_ids is not None: # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, - self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 sequence_lengths = sequence_lengths % input_ids.shape[-1] sequence_lengths = sequence_lengths.to(logits.device) else: @@ -1646,8 +1557,7 @@ def forward( if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or - labels.dtype == torch.int): + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): 
self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" diff --git a/examples/bitnet-1.58b/nvidia_measure_memory.sh b/examples/bitnet-1.58b/nvidia_measure_memory.sh index e8998f309..82cf4855f 100755 --- a/examples/bitnet-1.58b/nvidia_measure_memory.sh +++ b/examples/bitnet-1.58b/nvidia_measure_memory.sh @@ -1 +1,3 @@ +#!/usr/bin/env bash + nvidia-smi --query-gpu=memory.used --format=csv -lms 500 diff --git a/examples/bitnet-1.58b/tokenization_bitnet.py b/examples/bitnet-1.58b/tokenization_bitnet.py index 6fea3252a..2adfd6dee 100644 --- a/examples/bitnet-1.58b/tokenization_bitnet.py +++ b/examples/bitnet-1.58b/tokenization_bitnet.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for LLaMA.""" + import os from shutil import copyfile from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -37,12 +38,10 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "hf-internal-testing/llama-tokenizer": - "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model", }, "tokenizer_file": { - "hf-internal-testing/llama-tokenizer": - "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", + "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { @@ -159,14 +158,10 @@ def __init__( **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken( - bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken( - eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken( - unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken( - pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token + bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token if legacy is None: logger.warning_once( @@ -174,7 +169,8 @@ def __init__( " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." " If you want to use the new behavior, set `legacy=False`. 
This should only be set if you understand what it" " means, and thoroughly read the reason why this was added as explained in" - " https://github.com/huggingface/transformers/pull/24565") + " https://github.com/huggingface/transformers/pull/24565" + ) legacy = True self.legacy = legacy @@ -214,8 +210,7 @@ def get_spm_processor(self, from_slow=False): with open(self.vocab_file, "rb") as f: sp_model = f.read() - model_pb2 = import_protobuf( - f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") + model_pb2 = import_protobuf(f"The new behavior of {self.__class__.__name__} (with `self.legacy = False`)") model = model_pb2.ModelProto.FromString(sp_model) normalizer_spec = model_pb2.NormalizerSpec() normalizer_spec.add_dummy_prefix = False @@ -261,8 +256,7 @@ def tokenize(self, text: "TextInput", **kwargs) -> List[str]: tokens = super().tokenize(text, **kwargs) - if len(tokens - ) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: + if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens: tokens = tokens[1:] return tokens @@ -284,7 +278,7 @@ def _tokenize(self, text, **kwargs): # 1. Encode string + prefix ex: " Hey" tokens = self.sp_model.encode(self.unk_token + text, out_type=str) # 2. Remove self.unk_token from ['<','unk','>', '▁Hey'] - return tokens[self.unk_token_length:] if len(tokens) >= self.unk_token_length else tokens + return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -332,12 +326,9 @@ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") return - out_vocab_file = os.path.join(save_directory, - (filename_prefix + "-" if filename_prefix else "") + - VOCAB_FILES_NAMES["vocab_file"]) + out_vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile( - self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: @@ -357,10 +348,9 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): return output - def get_special_tokens_mask(self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. @@ -377,20 +367,16 @@ def get_special_tokens_mask(self, `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
""" if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) + return super().get_special_tokens_mask(token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + - ([0] * len(token_ids_1)) + eos_token_id) + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id - def create_token_type_ids_from_sequences(self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: + def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT sequence pair mask has the following format: @@ -473,9 +459,9 @@ def default_chat_template(self): "{% elif message['role'] == 'assistant' %}" "{{ ' ' + content.strip() + ' ' + eos_token }}" "{% endif %}" - "{% endfor %}") - template = template.replace("USE_DEFAULT_PROMPT", - "true" if self.use_default_system_prompt else "false") + "{% endfor %}" + ) + template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) diff --git a/examples/bitnet-1.58b/utils_quant.py b/examples/bitnet-1.58b/utils_quant.py index 5f5db5dbc..5a50edb39 100644 --- a/examples/bitnet-1.58b/utils_quant.py +++ b/examples/bitnet-1.58b/utils_quant.py @@ -24,15 +24,14 @@ def weight_quant(weight, num_bits=1): def activation_quant(x, num_bits=8): dtype = x.dtype x = x.float() - Qn = -(2**(num_bits - 1)) - Qp = 2**(num_bits - 1) - 1 + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) result = (x * s).round().clamp(Qn, Qp) / s return result.type(dtype) class BitLinearBitBLAS(nn.Module): - def __init__( self, in_features: int, @@ -68,7 +67,7 @@ def __init__( self.bitblas_matmul = self._get_or_create_bitblas_operator(matmul_config, ENABLE_TUNING) self.format = "bitnet" - self.Qp = 2**(self.input_bits - 1) - 1 + self.Qp = 2 ** (self.input_bits - 1) - 1 def _get_or_create_bitblas_operator(self, config, enable_tuning): if global_operator_cache.size() == 0: @@ -99,8 +98,7 @@ def replace_weight_param_with_qweight(self): @classmethod def from_bit_linear(cls, bitlinear, weight_group=1): - bitblas_linear = cls( - bitlinear.in_features, bitlinear.out_features, weight_bits=1, input_bits=8) + bitblas_linear = cls(bitlinear.in_features, bitlinear.out_features, weight_bits=1, input_bits=8) sw, qweight = bitblas_linear.create_bitblas_weights(bitlinear.weight, weight_group) bitblas_linear.register_buffer("qweight", qweight) bitblas_linear.register_buffer("sw", sw) @@ -158,8 +156,8 @@ def weight_quant(weight): @torch.compile def activation_quant(self, x, num_bits=8): x = x.float() - Qn = -(2**(num_bits - 1)) - Qp = 2**(num_bits - 1) - 1 + Qn = -(2 ** (num_bits - 1)) + Qp = 2 ** (num_bits - 1) - 1 s = Qp / x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5) result = (x * s).round().clamp(Qn, Qp) return 
result.type(torch.int8), s @@ -173,9 +171,8 @@ def post_quant_process(self, input, si, sw): # for the correctness evaluation. def native_forward(self, input): - quant_input = (input + (activation_quant(input, self.input_bits) - input).detach()) - quant_weight = ( - self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach()) + quant_input = input + (activation_quant(input, self.input_bits) - input).detach() + quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach() out = nn.functional.linear(quant_input, quant_weight) if self.bias is not None: @@ -214,7 +211,6 @@ def forward(self, input): # Naive BitLinear from HuggingFace class BitLinear(nn.Linear): - def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs): super(BitLinear, self).__init__(*kargs, **kwargs) """ @@ -224,10 +220,8 @@ def __init__(self, *kargs, weight_bits=1, input_bits=8, **kwargs): self.input_bits = input_bits def forward(self, input): - quant_input = input + (activation_quant(input, self.input_bits) - input).detach() - quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - - self.weight).detach() + quant_weight = self.weight + (weight_quant(self.weight, self.weight_bits) - self.weight).detach() out = nn.functional.linear(quant_input, quant_weight) if self.bias is not None: diff --git a/examples/bitnet-1.58b/vllm_workspace/conftest.py b/examples/bitnet-1.58b/vllm_workspace/conftest.py index 951f38991..e9e2997ef 100644 --- a/examples/bitnet-1.58b/vllm_workspace/conftest.py +++ b/examples/bitnet-1.58b/vllm_workspace/conftest.py @@ -20,7 +20,7 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.config import TokenizerPoolConfig -from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) +from vllm.distributed import destroy_distributed_environment, destroy_model_parallel from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.sequence import SampleLogprobs @@ -56,12 +56,13 @@ class _ImageAssetsBase(UserList[ImageAsset]): class _ImageAssets(_ImageAssetsBase): - def __init__(self) -> None: - super().__init__([ - ImageAsset("stop_sign"), - ImageAsset("cherry_blossom"), - ]) + super().__init__( + [ + ImageAsset("stop_sign"), + ImageAsset("cherry_blossom"), + ] + ) def prompts(self, prompts: _ImageAssetPrompts) -> List[str]: """ @@ -136,7 +137,6 @@ def image_assets() -> _ImageAssets: class HfRunner: - def wrap_device(self, input: _T) -> _T: if not is_cpu(): return input.to("cuda") @@ -166,7 +166,8 @@ def __init__( SentenceTransformer( model_name, device="cpu", - ).to(dtype=torch_dtype)) + ).to(dtype=torch_dtype) + ) else: if is_vision_model: auto_cls = AutoModelForVision2Seq @@ -184,7 +185,8 @@ def __init__( torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs, - )) + ) + ) self.tokenizer = AutoTokenizer.from_pretrained( model_name, @@ -204,8 +206,7 @@ def __init__( ) except Exception: logger.warning( - "Unable to auto-load processor from HuggingFace for " - "model %s. Using tokenizer instead.", + "Unable to auto-load processor from HuggingFace for model %s. 
Using tokenizer instead.", model_name, ) self.processor = self.tokenizer @@ -362,7 +363,7 @@ def generate_greedy_logprobs_limit( last_hidden_states, self.model.get_output_embeddings().weight.t(), ) - if (getattr(self.model.get_output_embeddings(), "bias", None) is not None): + if getattr(self.model.get_output_embeddings(), "bias", None) is not None: logits += self.model.get_output_embeddings().bias.unsqueeze(0) logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) seq_logprobs.append(logprobs) @@ -389,8 +390,7 @@ def generate_greedy_logprobs_limit( all_output_strs.append(self.tokenizer.decode(output_ids)) outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: return self.model.encode(prompts) @@ -409,7 +409,6 @@ def hf_runner(): class VllmRunner: - def __init__( self, model_name: str, @@ -514,12 +513,10 @@ def generate_greedy_logprobs( num_logprobs: int, images: Optional[List[Image.Image]] = None, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: - greedy_logprobs_params = SamplingParams( - temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs) + greedy_logprobs_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, logprobs=num_logprobs) outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params, images=images) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] + return [(output_ids, output_str, output_logprobs) for output_ids, output_str, output_logprobs in outputs] def generate_beam_search( self, diff --git a/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py b/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py index 55a24543e..ea18239cb 100644 --- a/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py +++ b/examples/bitnet-1.58b/vllm_workspace/inference_with_compress_format.py @@ -32,15 +32,14 @@ ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitblas", - # set enforce_eager = False to enable cuda graph - # set enforce_eager = True to disable cuda graph - enforce_eager=False, + ckpt_path, + dtype="half", + quantization="bitblas", + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: - bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], - max_tokens=1024) + bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=1024) print("bitnet inference:") print(bitbnet_outputs[0][0]) print(bitbnet_outputs[0][1]) diff --git a/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py b/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py index 4f5f87f6f..f631fb306 100644 --- a/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py +++ b/examples/bitnet-1.58b/vllm_workspace/inference_with_native_format.py @@ -33,13 +33,13 @@ ckpt_path = args.ckpt_path with VllmRunner( - ckpt_path, - dtype="half", - quantization="bitnet_bitblas", - gpu_memory_utilization=0.5, - # set enforce_eager = False to enable cuda graph - # set enforce_eager = True to disable cuda graph - enforce_eager=False, + ckpt_path, + dtype="half", + 
quantization="bitnet_bitblas", + gpu_memory_utilization=0.5, + # set enforce_eager = False to enable cuda graph + # set enforce_eager = True to disable cuda graph + enforce_eager=False, ) as bitnet_model: bitbnet_outputs = bitnet_model.generate_greedy(["Hi, tell me about microsoft?"], max_tokens=128) print("bitnet inference output:") diff --git a/examples/bitnet-1.58b/vllm_workspace/utils.py b/examples/bitnet-1.58b/vllm_workspace/utils.py index daa9d8f52..e96b19e28 100644 --- a/examples/bitnet-1.58b/vllm_workspace/utils.py +++ b/examples/bitnet-1.58b/vllm_workspace/utils.py @@ -3,8 +3,7 @@ TokensText = Tuple[List[int], str] -def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[TokensText], - name_0: str, name_1: str): +def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[TokensText], name_0: str, name_1: str): """ Compare the two sequences generated by different models, which should be equal. @@ -15,19 +14,14 @@ def check_outputs_equal(outputs_0_lst: List[TokensText], outputs_1_lst: List[Tok output_ids_0, output_str_0 = outputs_0 output_ids_1, output_str_1 = outputs_1 - assert output_str_0 == output_str_1, (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") - assert output_ids_0 == output_ids_1, (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") + assert output_str_0 == output_str_1, f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + assert output_ids_0 == output_ids_1, f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]] -def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], - outputs_1_lst: List[TokensTextLogprobs], name_0: str, name_1: str): +def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], outputs_1_lst: List[TokensTextLogprobs], name_0: str, name_1: str): """ Compare the logprobs of two sequences generated by different models, which should be similar but not necessarily equal. @@ -41,16 +35,11 @@ def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs], # Loop through generated tokens. for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match, then if output_id_0 != output_id_1: # Each predicted token must be in top N logprobs of the other - assert output_id_0 in logprobs_1[idx], (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") - assert output_id_1 in logprobs_0[idx], (f"Test{prompt_idx}:" - f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}") + assert output_id_0 in logprobs_1[idx], f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + assert output_id_1 in logprobs_0[idx], f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" # Break out since sequences will now diverge. break diff --git a/examples/blocksparse_attention/README.md b/examples/blocksparse_attention/README.md index 89f75b81d..34bf3c637 100644 --- a/examples/blocksparse_attention/README.md +++ b/examples/blocksparse_attention/README.md @@ -1,6 +1,5 @@ # Block-Sparse Flash-Attention -Tilelang implementation of block-sparse flash-attention kernels. - -The kernels have been used in [Rectified Sparse Attention](https://arxiv.org/abs/2506.04108) and [SeerAttention-R](https://arxiv.org/abs/2506.08889). +Tilelang implementation of block-sparse flash-attention kernels. 
+The kernels have been used in [Rectified Sparse Attention](https://arxiv.org/abs/2506.04108) and [SeerAttention-R](https://arxiv.org/abs/2506.08889). diff --git a/examples/blocksparse_attention/block_sparse_attn_triton.py b/examples/blocksparse_attention/block_sparse_attn_triton.py index 014f0c5fc..b94e602f6 100644 --- a/examples/blocksparse_attention/block_sparse_attn_triton.py +++ b/examples/blocksparse_attention/block_sparse_attn_triton.py @@ -1,7 +1,6 @@ # ruff: noqa: E712 import math import torch - import triton import triton.language as tl import torch.nn.functional as F @@ -15,10 +14,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -56,7 +52,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) # print @@ -73,8 +68,7 @@ def _fwd_kernel_inner( # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N if LAST_K_BLOCK: - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, - float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -154,7 +148,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -192,24 +186,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -254,7 +236,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -278,9 +259,9 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = 
torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) @@ -288,9 +269,7 @@ def test_topk_sparse_attention(): downsample_len = math.ceil(SEQ_LEN / downsample_factor) print("downsample_len", downsample_len) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 print("x_ds.shape", x_ds.shape) block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -302,22 +281,21 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # print("ref_output", ref_output) # print("triton_output", triton_output) # Verify accuracy - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -329,9 +307,9 @@ def test_topk_sparse_attention_qlt_kl(): torch.manual_seed(0) # Create inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) # softmax scale sm_scale = 1.0 / (D_HEAD**0.5) @@ -339,8 +317,7 @@ def test_topk_sparse_attention_qlt_kl(): print("downsample_factor", downsample_factor) downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension print("downsample_len", downsample_len) - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.bfloat16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.bfloat16) # Force the first column to be high so that the first block is always selected. 
x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -351,26 +328,25 @@ def test_topk_sparse_attention_qlt_kl(): past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # Verify accuracy. - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference when qlen < klen" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference when qlen < klen" print("Pass topk sparse attention test with qlen < klen") diff --git a/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py b/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py index 7e90db7e5..9a394710f 100644 --- a/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py +++ b/examples/blocksparse_attention/example_tilelang_block_sparse_attn.py @@ -1,8 +1,8 @@ import math import torch - import tilelang import tilelang.language as T +from tilelang.profiler import do_bench import torch.nn.functional as F @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -30,105 +27,34 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F @tilelang.jit( - out_idx=[4], pass_configs={ + out_idx=[4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def blocksparse_flashattn(batch, heads, seq_len, dim, downsample_len, is_causal): block_M = 64 block_N = 64 num_stages = 1 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - accum_dtype = "float" - block_mask_dtype = "bool" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.bool def kernel_func(block_M, block_N, num_stages, threads): - - @T.macro - def MMA0( - K: T.Tensor(shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: 
T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(shape, dtype), - V_shared: T.SharedBuffer([block_M, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def blocksparse_flashattn( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(shape, dtype), ): - with T.Kernel( - T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -141,31 +67,59 @@ def blocksparse_flashattn( scores_scale = T.alloc_fragment([block_M], accum_dtype) scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - block_mask = T.alloc_local([downsample_len], block_mask_dtype) + block_mask = T.alloc_fragment([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - for vj in T.serial(downsample_len): - block_mask[vj] = BlockSparseMask[bz, by, bx, vj] + T.copy(BlockSparseMask[bz, by, bx, :], block_mask) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[k] != 0: - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # in the first ceil_div(kBlockM, kBlockN) steps. 
+ # for i in T.Parallel(block_M): + # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return blocksparse_flashattn @@ -180,18 +134,16 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -202,15 +154,15 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) @@ -224,5 +176,26 @@ def main(): test_topk_sparse_attention() +def run_regression_perf(): + BATCH, N_HEADS, SEQ_LEN, D_HEAD = 1, 32, 256, 64 + TOPK = 2 + BLOCK = 64 + torch.manual_seed(0) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, 
SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + downsample_factor = BLOCK + downsample_len = math.ceil(SEQ_LEN / downsample_factor) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) + x_ds[:, :, :, 0] = 100 + block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) + kernel = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + + def run_kernel_only(): + kernel(q, k, v, block_mask) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py index e29982162..6e7321452 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py @@ -8,22 +8,26 @@ import argparse import time import math +from tilelang.profiler import do_bench from heuristic import num_splits_heuristic def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, page_block_size, num_split, num_stages, threads, num_pages, - max_num_blocks_per_seq, max_selected_blocks): + }, + ) + def kernel_func( + block_N, block_H, page_block_size, num_split, num_stages, threads, num_pages, max_num_blocks_per_seq, max_selected_blocks + ): shape_q = [batch, heads, dim] shape_k = [num_pages, page_block_size, heads_kv, dim] shape_v = [num_pages, page_block_size, heads_kv, dim_v] @@ -35,19 +39,20 @@ def kernel_func(block_N, block_H, page_block_size, num_split, num_stages, thread assert block_N <= page_block_size and page_block_size % block_N == 0 block_ratio = page_block_size // block_N - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - block_table: T.Tensor(shape_block_table, "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + block_table: T.Tensor(shape_block_table, T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + # flash_attn_split + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim_v], dtype) @@ -67,7 +72,7 @@ def flash_attn_split( sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + 
block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -75,7 +80,7 @@ def flash_attn_split( num_blocks = max_selected_blocks blocks_per_split = T.floordiv(num_blocks, num_split) remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) start = blocks_per_split * sid + T.min(sid, remaining_blocks) has_valid_block = False for k in T.Pipelined(loop_range, num_stages=num_stages): @@ -85,30 +90,20 @@ def flash_attn_split( block_table_idx = T.floordiv(logical_block_idx, block_ratio) block_tile_idx = T.floormod(logical_block_idx, block_ratio) physical_block_idx = block_table[bid, block_table_idx] - T.copy( - K[physical_block_idx, - block_tile_idx * block_N:(block_tile_idx + 1) * block_N, - cur_kv_head, :], K_shared) + T.copy(K[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.if_then_else( - logical_block_idx * block_N + j >= cache_seqlens[bid], - -T.infinity(accum_dtype), acc_s[i, j]) + logical_block_idx * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j] + ) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -117,10 +112,7 @@ def flash_attn_split( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim_v): acc_o[i, j] *= scores_scale[i] - T.copy( - V[physical_block_idx, - block_tile_idx * block_N:(block_tile_idx + 1) * block_N, - cur_kv_head, :], V_shared) + T.copy(V[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_valid_block: for i, j in T.Parallel(block_H, dim_v): @@ -137,74 +129,47 @@ def flash_attn_split( if i < valid_block_H: Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): + # combine with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim_v], accum_dtype) o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - max_split = T.alloc_local([1], "int32") - - T.annotate_layout({ - lse_logsum_local: - 
T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) T.clear(lse_logsum_local) T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) + lse_max_local = -T.infinity(accum_dtype) for k in T.serial(num_split): - lse_local_split[0] = glse[bz, by, k] - if (lse_local_split[0] != 0): - max_split[0] = k - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) for k in T.Pipelined(num_split, num_stages=1): - if k <= max_split[0]: - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + if k <= max_split: + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local for k in T.serial(num_split): - if k <= max_split[0]: + if k <= max_split: for i in T.Parallel(dim_v): po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local[0] + o_accum_local[i] += po_local[i] * scale_local for i in T.Parallel(dim_v): Output[bz, by, i] = o_accum_local[i] - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - block_table: T.Tensor(shape_block_table, "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn_split(Q, K, V, block_indices, cache_seqlens, block_table, glse, - Output_partial) - combine(glse, Output_partial, Output) - return main return kernel_func class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_pages): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -250,18 +215,11 @@ def forward(self, query, key, value, block_indices, cache_seqlens, block_table): num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") output = self.kernel( query, @@ -276,14 +234,13 @@ def forward(self, query, key, value, block_indices, cache_seqlens, block_table): return output -def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_seqlens, - block_table, page_block_size, block_size): +def 
ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_seqlens, block_table, page_block_size, block_size): """ Paged version of sparse attention reference implementation. - + Args: query: [batch, heads, dim] - key_cache: [num_pages, page_block_size, heads_kv, dim] + key_cache: [num_pages, page_block_size, heads_kv, dim] value_cache: [num_pages, page_block_size, heads_kv, dim] block_indices: [batch, heads_kv, max_selected_blocks] - logical block indices cache_seqlens: [batch] - actual sequence lengths @@ -299,12 +256,8 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ # Reconstruct the full key and value tensors from paged cache max_cache_seqlen = max(cache_seqlens).item() - key_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim), - dtype=key_cache.dtype, - device=key_cache.device) - value_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim_v), - dtype=value_cache.dtype, - device=value_cache.device) + key_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim), dtype=key_cache.dtype, device=key_cache.device) + value_full = torch.zeros((batch, heads_kv, max_cache_seqlen, dim_v), dtype=value_cache.dtype, device=value_cache.device) # Reconstruct full tensors from paged cache using block_table for b in range(batch): @@ -320,20 +273,14 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ actual_block_size = end_token - start_token # Copy from paged cache to full tensors - key_full[b, :, start_token:end_token, :] = key_cache[ - physical_block_idx, :actual_block_size, :, :].transpose(0, 1) - value_full[b, :, start_token:end_token, :] = value_cache[ - physical_block_idx, :actual_block_size, :, :].transpose(0, 1) + key_full[b, :, start_token:end_token, :] = key_cache[physical_block_idx, :actual_block_size, :, :].transpose(0, 1) + value_full[b, :, start_token:end_token, :] = value_cache[physical_block_idx, :actual_block_size, :, :].transpose(0, 1) # Reshape query for grouped attention - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] # Compute attention scores - scores = einsum( - query, key_full, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key_full, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] # Create sparse mask based on block_indices sparse_mask = torch.zeros_like(scores) @@ -349,24 +296,23 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ sparse_mask[b, :, h, start_pos:end_pos] = 1 # Apply sparse mask - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) # Apply causal mask based on actual sequence lengths range_len = torch.arange(scores.shape[-1], device=scores.device).unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) + scores = scores.masked_fill(pad_mask, float("-inf")) # Compute attention weights attention = F.softmax(scores / scale, dim=-1) # Apply attention to values - out = einsum(attention, value_full, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] + out = einsum(attention, value_full, "b g h s, b h s d -> 
b g h d") # [batch_size, num_head_groups, heads_kv, dim] # Reshape output back to original format - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -374,17 +320,23 @@ def ref_program_torch_paged(query, key_cache, value_cache, block_indices, cache_ def ref_program_fa(query, kcache, vcache, cache_seqlens, block_table): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) - output = flash_attn_with_kvcache( - query, kcache, vcache, cache_seqlens=cache_seqlens, block_table=block_table) + output = flash_attn_with_kvcache(query, kcache, vcache, cache_seqlens=cache_seqlens, block_table=block_table) output = output.squeeze(1) return output def main(args): - - batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = ( + args.batch, + args.heads, + args.heads_kv, + args.max_cache_seqlen, + args.dim, + args.dim_v, + ) sparse_ratio = args.sparse_ratio block_N = args.block_N page_block_size = args.page_block_size @@ -396,35 +348,30 @@ def main(args): dtype = torch.float16 # Generate random inputs - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - cache_seqlens = torch.randint( - max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device="cuda") print("cache_seqlens: ", cache_seqlens) - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") # Create paged KV cache - K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device='cuda') - V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), - dtype=dtype, - device='cuda') + K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") + V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") # Create block table and block indices for dense case (all blocks selected) max_num_blocks_per_seq = int(math.ceil(max_cache_seqlen / page_block_size)) print("max_num_blocks_per_seq: ", max_num_blocks_per_seq) - block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device='cuda') - block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), - dtype=torch.int32, - device='cuda') + block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device="cuda") + block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), dtype=torch.int32, device="cuda") # Fill block table and block indices and cache # Create a pool of available physical blocks - total_blocks_needed = sum( - int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) + total_blocks_needed = sum(int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in 
range(batch)) available_blocks = list(range(total_blocks_needed)) import random + random.seed(42) # For reproducibility random.shuffle(available_blocks) @@ -459,10 +406,8 @@ def main(args): actual_block_size = end_token - start_token # Copy K and V data to the paged cache - K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, - start_token:end_token, :, :] - V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, - start_token:end_token, :, :] + K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, start_token:end_token, :, :] + V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, start_token:end_token, :, :] # Fill block_indices for sparse attention # For dense case (verification), we select all blocks in reverse order @@ -497,10 +442,9 @@ def main(args): remaining_blocks = [b for b in all_blocks if b not in selected_blocks] if remaining_blocks: import random + random.seed(42) # For reproducibility - additional_blocks = random.sample( - remaining_blocks, - min(num_selected - recent_blocks, len(remaining_blocks))) + additional_blocks = random.sample(remaining_blocks, min(num_selected - recent_blocks, len(remaining_blocks))) selected_blocks.extend(additional_blocks) # Sort selected blocks in reverse order (most recent first) @@ -513,25 +457,20 @@ def main(args): block_indices[seq_idx, head_idx, i] = -1 # Initialize sparse attention module - sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, - num_blocks) - output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, - block_table) + sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_blocks) + output_sparse = sparse_attn.forward(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table) import flash_attn # noqa: F401 - output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, cache_seqlens, - block_table, page_block_size, block_N) + output_ref_torch = ref_program_torch_paged(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table, page_block_size, block_N) output_ref_fa = ref_program_fa(Q, K_cache, V_cache, cache_seqlens, block_table) # Check correctness if sparse_ratio == 0.0: max_diff = torch.max(torch.abs(output_sparse - output_ref_fa)).item() mean_diff = torch.mean(torch.abs(output_sparse - output_ref_fa)).item() - assert torch.allclose( - output_ref_fa, output_ref_torch, atol=1e-2), "Reference outputs do not match!" + assert torch.allclose(output_ref_fa, output_ref_torch, atol=1e-2), "Reference outputs do not match!" 
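For context on the paged reference being reformatted above: `ref_program_torch_paged` first gathers the paged KV cache back into contiguous per-batch tensors through `block_table`, and only then applies the sparse block mask and dense attention. The sketch below shows that gather step in isolation; the helper name `gather_paged_kv` and the shapes in the usage comment are assumptions for illustration, not code from this patch.

import math
import torch

def gather_paged_kv(cache, block_table, cache_seqlens, page_block_size):
    # cache:        [num_pages, page_block_size, heads_kv, dim] paged KV storage
    # block_table:  [batch, max_blocks_per_seq] logical-block -> physical-page mapping
    # cache_seqlens:[batch] number of valid tokens per sequence
    batch = block_table.shape[0]
    heads_kv, dim = cache.shape[2], cache.shape[3]
    max_seqlen = int(cache_seqlens.max().item())
    full = torch.zeros((batch, heads_kv, max_seqlen, dim), dtype=cache.dtype, device=cache.device)
    for b in range(batch):
        seqlen = int(cache_seqlens[b].item())
        for blk in range(math.ceil(seqlen / page_block_size)):
            page = int(block_table[b, blk].item())
            start = blk * page_block_size
            end = min(start + page_block_size, seqlen)
            # cache slice is [tokens, heads_kv, dim]; transpose to [heads_kv, tokens, dim]
            full[b, :, start:end, :] = cache[page, : end - start].transpose(0, 1)
    return full

# Hypothetical usage mirroring the reference code:
# k_full = gather_paged_kv(K_cache, block_table, cache_seqlens, page_block_size=256)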
else: - max_diff = torch.max(torch.abs(output_sparse - output_ref_torch)).item() mean_diff = torch.mean(torch.abs(output_sparse - output_ref_torch)).item() @@ -573,18 +512,140 @@ def main(args): print(f"Speedup: {kernel_time_fa / kernel_time:.2f}x") +def run_regression_perf(args): + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = ( + args.batch, + args.heads, + args.heads_kv, + args.max_cache_seqlen, + args.dim, + args.dim_v, + ) + sparse_ratio = args.sparse_ratio + block_N = args.block_N + page_block_size = args.page_block_size + num_blocks = args.num_pages + max_selected_blocks = int(math.ceil(max_cache_seqlen / block_N)) + dtype = torch.float16 + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") + V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") + max_num_blocks_per_seq = int(math.ceil(max_cache_seqlen / page_block_size)) + block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device="cuda") + block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), dtype=torch.int32, device="cuda") + total_blocks_needed = sum(int(math.ceil(cache_seqlens[seq_idx].item() / page_block_size)) for seq_idx in range(batch)) + available_blocks = list(range(total_blocks_needed)) + import random + + random.seed(42) + random.shuffle(available_blocks) + block_assignment = {} + block_idx_counter = 0 + for seq_idx in range(batch): + seq_len = cache_seqlens[seq_idx].item() + num_blocks_needed = int(math.ceil(seq_len / page_block_size)) + for block_idx in range(num_blocks_needed): + physical_block_idx = available_blocks[block_idx_counter] + block_table[seq_idx, block_idx] = physical_block_idx + block_assignment[(seq_idx, block_idx)] = physical_block_idx + block_idx_counter += 1 + for seq_idx in range(batch): + seq_len = cache_seqlens[seq_idx].item() + num_blocks_needed = int(math.ceil(seq_len / page_block_size)) + for block_idx in range(num_blocks_needed): + physical_block_idx = block_assignment[(seq_idx, block_idx)] + start_token = block_idx * page_block_size + end_token = min(start_token + page_block_size, seq_len) + actual_block_size = end_token - start_token + K_cache[physical_block_idx, :actual_block_size, :, :] = K[seq_idx, start_token:end_token, :, :] + V_cache[physical_block_idx, :actual_block_size, :, :] = V[seq_idx, start_token:end_token, :, :] + for seq_idx in range(batch): + seq_len = cache_seqlens[seq_idx].item() + num_tile = int(math.ceil(seq_len / block_N)) + if sparse_ratio == 0.0: + selected_blocks = min(num_tile, max_selected_blocks) + for head_idx in range(heads_kv): + for i in range(selected_blocks): + block_indices[seq_idx, head_idx, i] = num_tile - 1 - i + for i in range(selected_blocks, max_selected_blocks): + block_indices[seq_idx, head_idx, i] = -1 + else: + num_selected = int(num_tile * (1.0 - sparse_ratio)) + num_selected = max(1, min(num_selected, max_selected_blocks)) + all_blocks = list(range(num_tile)) + for head_idx in range(heads_kv): + selected_blocks = [] + recent_blocks = 1 + selected_blocks.append(num_tile - 1) + if num_selected > recent_blocks: + remaining_blocks = [b for b in all_blocks 
if b not in selected_blocks] + if remaining_blocks: + import random + + random.seed(42) + additional_blocks = random.sample(remaining_blocks, min(num_selected - recent_blocks, len(remaining_blocks))) + selected_blocks.extend(additional_blocks) + + selected_blocks.sort(reverse=True) + + for i in range(len(selected_blocks)): + block_indices[seq_idx, head_idx, i] = selected_blocks[i] + for i in range(len(selected_blocks), max_selected_blocks): + block_indices[seq_idx, head_idx, i] = -1 + + sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_blocks) + kernel = sparse_attn.kernel + batch = sparse_attn.batch + heads = sparse_attn.heads + heads_kv = sparse_attn.heads_kv + dim_v = sparse_attn.dim_v + dim = sparse_attn.dim + block_size = sparse_attn.block_N + max_selected_blocks = block_indices.shape[-1] + + num_m_blocks = 1 * (heads // heads_kv + sparse_attn.block_H - 1) // sparse_attn.block_H + num_n_blocks = max_selected_blocks + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 + total_mblocks = batch * heads_kv * num_m_blocks + + num_sm = sparse_attn.num_sm + + num_split = num_splits_heuristic( + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + + def run_kernel_only(): + kernel( + Q, + K_cache, + V_cache, + block_indices, + cache_seqlens, + block_table, + glse, + output_partial, + ) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.0, help='sparse ratio') - parser.add_argument('--block_N', type=int, default=64, help='block_N') - parser.add_argument('--page_block_size', type=int, default=256, help='block size of pages') - parser.add_argument('--num_pages', type=int, default=1024, help='total number of pages') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.0, help="sparse ratio") + parser.add_argument("--block_N", type=int, default=64, help="block_N") + parser.add_argument("--page_block_size", type=int, default=256, help="block size of pages") + parser.add_argument("--num_pages", type=int, default=1024, help="total number of pages") args = parser.parse_args() main(args) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py 
index ae3004267..d6cf7d917 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py @@ -7,20 +7,22 @@ import time import math from heuristic import num_splits_heuristic +from tilelang.profiler import do_bench def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) - def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, - max_selected_blocks): + }, + ) + def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, max_selected_blocks): shape_q = [batch, heads, dim] shape_k = [batch, max_cache_seqlen, heads_kv, dim] shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] @@ -29,19 +31,21 @@ def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seql part_shape = [batch, heads, num_split, dim_v] valid_block_H = min(block_H, kv_group_num) - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - # actual_num_blocks: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + # actual_num_blocks: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + # flash_attn_split(Q, K, V, block_indices, cache_seqlens, actual_num_blocks, glse, Output_partial) + # flash_attn_split + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim_v], dtype) @@ -62,7 +66,7 @@ def flash_attn_split( sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -70,7 +74,7 @@ def flash_attn_split( num_blocks = max_selected_blocks blocks_per_split = T.floordiv(num_blocks, num_split) remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) start = blocks_per_split * sid + T.min(sid, remaining_blocks) has_valid_block = False @@ -78,27 +82,18 @@ def flash_attn_split( i_s = block_indices[bid, cur_kv_head, start + k] if i_s >= 0: has_valid_block = True - T.copy(K[bid, i_s * block_N:(i_s + 1) * block_N, cur_kv_head, :], K_shared) + T.copy(K[bid, i_s 
* block_N : (i_s + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): - acc_s[i, - j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], - -T.infinity(accum_dtype), acc_s[i, j]) + acc_s[i, j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j]) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_max[i] = T.if_then_else(scores_max[i] > scores_max_prev[i], - scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -107,7 +102,7 @@ def flash_attn_split( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim_v): acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, i_s * block_N:(i_s + 1) * block_N, cur_kv_head, :], V_shared) + T.copy(V[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_valid_block: for i, j in T.Parallel(block_H, dim_v): @@ -124,74 +119,47 @@ def flash_attn_split( if i < valid_block_H: Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): + # combine with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim_v], accum_dtype) o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - max_split = T.alloc_local([1], "int32") - - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) T.clear(lse_logsum_local) T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) + lse_max_local = -T.infinity(accum_dtype) for k in T.serial(num_split): - lse_local_split[0] = glse[bz, by, k] - if (lse_local_split[0] != 0): - max_split[0] = k - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) for k in T.Pipelined(num_split, num_stages=1): - if k <= max_split[0]: - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + if k <= max_split: + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - 
lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local for k in T.serial(num_split): - if k <= max_split[0]: + if k <= max_split: for i in T.Parallel(dim_v): po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local[0] + o_accum_local[i] += po_local[i] * scale_local for i in T.Parallel(dim_v): Output[bz, by, i] = o_accum_local[i] - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - # actual_num_blocks: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - # flash_attn_split(Q, K, V, block_indices, cache_seqlens, actual_num_blocks, glse, Output_partial) - flash_attn_split(Q, K, V, block_indices, cache_seqlens, glse, Output_partial) - combine(glse, Output_partial, Output) - return main return kernel_func class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -210,7 +178,8 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks")) + max_selected_blocks=T.dynamic("max_selected_blocks"), + ) props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -233,25 +202,17 @@ def forward(self, query, key, value, block_indices, cache_seqlens): num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") output = self.kernel(query, key, value, block_indices, cache_seqlens, glse, output_partial) return output -def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seqlens, - max_cache_seqlen, block_size): +def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, block_size): """ Args: query: [batch, heads, dim] @@ -273,31 +234,24 @@ def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seql block_H = 64 actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32) - actual_num_blocks = actual_num_blocks[:, - 0] #[batch], number of valid blocks, assume all groups in the same batch have the same number of blocks + actual_num_blocks = actual_num_blocks[ + :, 0 + ] # [batch], number of valid blocks, assume all groups in the same batch have the same number of blocks # get num_split num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // 
block_H - num_n_blocks = max_selected_blocks #(kv_seqlen + block_size - 1 ) // block_size + num_n_blocks = max_selected_blocks # (kv_seqlen + block_size - 1 ) // block_size # num_n_blocks = torch.sum(actual_num_blocks, dim=-1).item() * heads_kv # total number of blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 132 num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( block_N=block_size, block_H=block_H, @@ -305,29 +259,24 @@ def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seql num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks")) + max_selected_blocks=T.dynamic("max_selected_blocks"), + ) output = kernel(query, key, value, block_indices, cache_seqlens, glse, Output_partial) return output -def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values based on block_indices @@ -336,28 +285,26 @@ def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache valid_indices = block_indices[b, h] # Extract indices for this batch and head for idx in valid_indices: if idx >= 0: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], 
device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out -def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): +def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) @@ -369,23 +316,13 @@ def debug(name, expect, actual, atol=1e-3, rtol=1e-3): print(name + " all_close={}".format(all_close)) if not all_close: diff = (expect - actual).abs() - print("all_close={}, max={}, min={}, mean={}".format(all_close, - diff.max().item(), - diff.min().item(), - diff.mean().item())) + print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == diff.max().item()) first_index = tuple(max_indices[0].tolist()) print(f"Index: {first_index}, expect: {expect[first_index]}, actual: {actual[first_index]}") -def main(batch=8, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): +def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -393,10 +330,10 @@ def main(batch=8, print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') # # Ensure at least one element equals cache_seqlen # random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index @@ -407,10 +344,7 @@ def main(batch=8, max_valid_num_blocks = 
torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_indices with -1 (for padding blocks) - block_indices = torch.full((batch, heads_kv, max_selected_blocks), - -1, - dtype=torch.int32, - device='cuda') + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") # max_num_blocks = int((max_cache_seqlen + block_size - 1)/ block_size) # block_indices = torch.full((batch, heads_kv, max_num_blocks), -1, dtype=torch.int32, device='cuda') @@ -419,10 +353,9 @@ def main(batch=8, max_valid_block = max_valid_num_blocks[b].item() # Max valid blocks for this batch if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - valid_indices = torch.randperm( - max_valid_block, device='cuda', dtype=torch.int32)[:max_selected_blocks] + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] # valid_indices = torch.randperm(max_valid_block, device='cuda', dtype=torch.int32)[:max_num_blocks] - block_indices[b, h, :len(valid_indices)] = valid_indices + block_indices[b, h, : len(valid_indices)] = valid_indices # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) @@ -435,8 +368,7 @@ def main(batch=8, print("max_num_blocks: ", max_num_blocks) # parity reference - ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) @@ -446,13 +378,11 @@ def main(batch=8, ## latency reference for _ in range(10): - ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, - max_num_blocks, block_size) + ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) torch.cuda.synchronize() start = time.time() for _ in range(100): - ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, - max_num_blocks, block_size) + ref = ref_program_fa(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) torch.cuda.synchronize() print("dense time: ", (time.time() - start) / 100 * 1000) @@ -468,17 +398,67 @@ def main(batch=8, print("sparse time: ", (time.time() - start) / 100 * 1000) +def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v + sparse_ratio = sparse_ratio + block_size = block_size + max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) + dtype = torch.float16 + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") + max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") + + for b in 
range(batch): + max_valid_block = max_valid_num_blocks[b].item() + if max_valid_block > 0: + for h in range(heads_kv): + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] + block_indices[b, h, : len(valid_indices)] = valid_indices + + block_indices, _ = block_indices.sort(dim=-1, descending=True) + sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) + batch = sparse_kernel.batch + heads = sparse_kernel.heads + heads_kv = sparse_kernel.heads_kv + dim_v = sparse_kernel.dim_v + dim = sparse_kernel.dim + block_size = sparse_kernel.block_size + max_selected_blocks = block_indices.shape[-1] + + num_m_blocks = 1 * (heads // heads_kv + sparse_kernel.block_H - 1) // sparse_kernel.block_H + num_n_blocks = max_selected_blocks + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 + total_mblocks = batch * heads_kv * num_m_blocks + num_sm = sparse_kernel.num_sm + + num_split = num_splits_heuristic( + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = sparse_kernel.kernel + + def run_kernel_only(): + kernel(Q, K, V, block_indices, cache_seqlens, glse, output_partial) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py index ad62817dd..e48428fb8 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py @@ -5,22 +5,24 @@ import tilelang.language as T from einops import rearrange, einsum import 
argparse - import time import math from heuristic import num_splits_heuristic +from tilelang.profiler import do_bench def flashattn(batch, heads, heads_kv, dim, dim_v): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // heads_kv @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, + ) def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, num_blocks): shape_q = [batch, heads, dim] shape_k = [batch, max_cache_seqlen, heads_kv, dim] @@ -30,22 +32,21 @@ def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seql part_shape = [batch, heads, num_split, dim_v] valid_block_H = min(block_H, kv_group_num) - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, "bool"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_mask: T.Tensor(shape_mask, T.bool), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): - with T.Kernel( - batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim_v], dtype) - # O_shared = T.alloc_shared([valid_block_H, dim_v], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) @@ -62,38 +63,31 @@ def flash_attn_split( sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) blocks_per_split = T.floordiv(num_blocks, num_split) remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0)) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) start = blocks_per_split * sid + T.min(sid, remaining_blocks) has_valid_block = False for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[bid, hid, start + k]: has_valid_block = True - T.copy( - K[bid, (start + k) * block_N:(start + k + 1) * block_N, cur_kv_head, :], - K_shared) + T.copy(K[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else((start + k) * block_N + j - >= cache_seqlens[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + acc_s[i, j] = 
T.if_then_else( + (start + k) * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j] + ) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -102,9 +96,7 @@ def flash_attn_split( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim_v): acc_o[i, j] *= scores_scale[i] - T.copy( - V[bid, (start + k) * block_N:(start + k + 1) * block_N, cur_kv_head, :], - V_shared) + T.copy(V[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) if has_valid_block: for i, j in T.Parallel(block_H, dim_v): @@ -120,65 +112,39 @@ def flash_attn_split( if i < valid_block_H: Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): with T.Kernel(heads, batch, threads=128) as (by, bz): po_local = T.alloc_fragment([dim_v], accum_dtype) o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: - T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) T.clear(lse_logsum_local) T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) + lse_max_local = -T.infinity(accum_dtype) for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local for k in T.serial(num_split): for i in T.Parallel(dim_v): po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local[0] + o_accum_local[i] += po_local[i] * scale_local for i in T.Parallel(dim_v): Output[bz, by, i] = o_accum_local[i] - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, "bool"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - 
Output: T.Tensor(shape_o, dtype), - ): - flash_attn_split(Q, K, V, block_mask, cache_seqlens, glse, Output_partial) - combine(glse, Output_partial, Output) - return main return kernel_func class SparseFlashAttn(torch.nn.Module): - def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): super(SparseFlashAttn, self).__init__() self.batch = batch @@ -197,7 +163,8 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks")) + num_blocks=T.dynamic("num_blocks"), + ) props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -216,24 +183,16 @@ def forward(self, query, key, value, block_mask, cache_seqlens): num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks # num_sm = 132 num_sm = self.num_sm num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # print("num_split: ", num_split) - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") output = self.kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) return output @@ -258,26 +217,21 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, block_H = 64 actual_num_blocks = torch.sum(block_mask, dim=-1).to(torch.int32) - actual_num_blocks = actual_num_blocks[:, - 0] #[batch], number of valid blocks, assume all groups in the same batch have the same number of blocks + actual_num_blocks = actual_num_blocks[ + :, 0 + ] # [batch], number of valid blocks, assume all groups in the same batch have the same number of blocks max_selected_blocks = actual_num_blocks.max().item() # get num_split num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H - num_n_blocks = max_selected_blocks #(kv_seqlen + block_size - 1 ) // block_size + num_n_blocks = max_selected_blocks # (kv_seqlen + block_size - 1 ) // block_size # num_n_blocks = torch.sum(actual_num_blocks, dim=-1).item() * heads_kv # total number of blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 132 num_split = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( block_N=block_size, @@ -286,11 +240,10 @@ def sparse_gqa_decode_varlen_mask(query, key, value, 
block_mask, cache_seqlens, num_stages=2, threads=128, max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks")) - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device='cuda') - Output_partial = torch.empty((batch, heads, num_split, dim_v), - dtype=torch.float32, - device='cuda') + num_blocks=T.dynamic("num_blocks"), + ) + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") # print(kernel.get_kernel_source()) output = kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) @@ -298,24 +251,18 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, return output -def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values @@ -323,29 +270,27 @@ def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_se for h in range(heads_kv): for idx in range(num_blocks): if block_mask[b, h, idx]: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out -def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, 
- block_size): +def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) @@ -359,23 +304,13 @@ def debug(name, expect, actual, atol=1e-3, rtol=1e-3): # print(expect[3, 28]) # print(actual[3, 28]) diff = (expect - actual).abs() - print("all_close={}, max={}, min={}, mean={}".format(all_close, - diff.max().item(), - diff.min().item(), - diff.mean().item())) + print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == diff.max().item()) first_index = tuple(max_indices[0].tolist()) print(f"Index: {first_index}, expect: {expect[first_index]}, actual: {actual[first_index]}") -def main(batch=8, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): +def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -383,14 +318,13 @@ def main(batch=8, print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') print("cache_seqlens: ", cache_seqlens) @@ -402,7 +336,7 @@ def main(batch=8, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_mask with false (for padding blocks) - block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device='cuda') + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -410,13 +344,12 @@ def main(batch=8, valid_num_block = valid_num_blocks[b].item() # Valid blocks for this batch if valid_num_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - 
perm = torch.randperm(max_valid_block, device='cuda')[:valid_num_block] + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True # print("block_mask: ", block_mask) # parity reference - ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) # out = sparse_gqa_decode_varlen_mask(Q, K, V, block_mask, cache_seqlens, block_size) model = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = model(Q, K, V, block_mask, cache_seqlens) @@ -426,13 +359,11 @@ def main(batch=8, ## latency reference for _ in range(10): - ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) torch.cuda.synchronize() start = time.time() for _ in range(100): - ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_fa(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) torch.cuda.synchronize() print("dense time: ", (time.time() - start) / 100 * 1000) @@ -449,17 +380,72 @@ def main(batch=8, print("sparse time: ", (time.time() - start) / 100 * 1000) +def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): + batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v + sparse_ratio = sparse_ratio + block_size = block_size + max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) + dtype = torch.float16 + + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") + random_index = torch.randint(0, batch, (1,), device="cuda").item() + cache_seqlens[random_index] = max_cache_seqlen + + num_blocks = (max_cache_seqlen + block_size - 1) // block_size + + valid_num_blocks = torch.ceil(cache_seqlens * (1 - sparse_ratio) / block_size).int() + max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") + + for b in range(batch): + max_valid_block = max_valid_num_blocks[b].item() + valid_num_block = valid_num_blocks[b].item() + if valid_num_block > 0: + for h in range(heads_kv): + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] + block_mask[b, h, perm] = True + + model = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) + batch = model.batch + heads = model.heads + heads_kv = model.heads_kv + dim_v = model.dim_v + dim = model.dim + block_size = model.block_size + block_H = model.block_H + max_cache_seqlen = K.shape[1] + max_selected_blocks = (max_cache_seqlen + block_size - 1) // block_size + num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H + num_n_blocks = max_selected_blocks + + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 + total_mblocks = batch * heads_kv * num_m_blocks + num_sm = model.num_sm + num_split = num_splits_heuristic( + total_mblocks, num_sm, num_n_blocks, 
num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = model.kernel + + def run_kernel_only(): + kernel(Q, K, V, block_mask, cache_seqlens, glse, Output_partial) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py index 85b72b775..01695742b 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py @@ -5,19 +5,15 @@ import argparse from einops import rearrange, einsum import torch.nn.functional as F - import math import time from heuristic import num_splits_heuristic +from tilelang.profiler import do_bench @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_H", "BLOCK_N", "BLOCK_D"], ) @triton.jit def _split_kernel( @@ -79,16 +75,11 @@ def _split_kernel( loop_range = blocks_per_split q_ptr += batch_idx * stride_q_b + head_idx_q * stride_q_h - k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[ - None, :] * stride_k_s + offs_d[:, None] * stride_k_d - v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, - None] * stride_v_s + offs_d[ - None, :] * stride_v_d + k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[None, :] * stride_k_s + offs_d[:, None] * stride_k_d 
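Aside: the pointer arithmetic being reflowed in this kernel builds a whole K tile of addresses at once by broadcasting two 1-D index vectors. A standalone NumPy sketch of the same pattern, using made-up tile sizes and strides rather than values from this diff:

import numpy as np

# Hypothetical tile sizes and element strides, for illustration only.
BLOCK_N, BLOCK_D = 4, 3
stride_k_s, stride_k_d = 8, 1

offs_n = np.arange(BLOCK_N)  # positions along the KV sequence axis
offs_d = np.arange(BLOCK_D)  # positions along the head-dim axis

# Broadcasting a row vector against a column vector gives a (BLOCK_D, BLOCK_N)
# grid of flat element offsets, the same grid the kernel adds to k_cache_ptr
# (and, transposed, to v_cache_ptr) before tl.load.
k_tile_offsets = offs_n[None, :] * stride_k_s + offs_d[:, None] * stride_k_d
print(k_tile_offsets)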
+ v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, None] * stride_v_s + offs_d[None, :] * stride_v_d mask_ptr += batch_idx * stride_mask_b + head_idx_kv * stride_mask_h - q = tl.load( - q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, - mask=offs_h[:, None] < gqa_group_size) + q = tl.load(q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, mask=offs_h[:, None] < gqa_group_size) start = blocks_per_split * split_idx + tl.minimum(split_idx, remaining_blocks) for i in range(loop_range): block_idx = tl.load(mask_ptr + (start + i) * stride_mask_s) @@ -119,23 +110,18 @@ def _split_kernel( acc = acc * l_recip acc = acc.to(o_partial_ptr.dtype.element_ty) - lse_partial_ptr += batch_idx * stride_lse_b + ( - head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split + lse_partial_ptr += batch_idx * stride_lse_b + (head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split tl.store(lse_partial_ptr, m_i, mask=offs_h < gqa_group_size) - o_partial_ptr += batch_idx * stride_o_b + ( - head_idx_q + - offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + o_partial_ptr += ( + batch_idx * stride_o_b + (head_idx_q + offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + ) tl.store(o_partial_ptr, acc, mask=offs_h[:, None] < gqa_group_size) @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_D"], ) @triton.jit def _merge_kernel( @@ -163,18 +149,15 @@ def _merge_kernel( offs_d = tl.arange(0, BLOCK_D) lse_offsets = lse_partial_ptr + batch_idx * lse_partial_stride_b + head_idx * lse_partial_stride_h - lse = tl.load( - lse_offsets + offs_splits * lse_partial_stride_split, - mask=offs_splits < num_splits, - other=float("-inf")) + lse = tl.load(lse_offsets + offs_splits * lse_partial_stride_split, mask=offs_splits < num_splits, other=float("-inf")) lse_max = tl.max(lse) o_offsets = o_partial_ptr + batch_idx * o_partial_stride_b + head_idx * o_partial_stride_h o_partial = tl.load( - o_offsets + offs_splits[:, None] * o_partial_stride_split + - offs_d[None, :] * o_partial_stride_d, - mask=offs_splits[:, None] < num_splits) + o_offsets + offs_splits[:, None] * o_partial_stride_split + offs_d[None, :] * o_partial_stride_d, + mask=offs_splits[:, None] < num_splits, + ) sumexp_normalized_splitk = tl.exp(lse - lse_max) sumexp_normalized = tl.sum(sumexp_normalized_splitk, axis=0) numerator_normalized = tl.sum(o_partial * sumexp_normalized_splitk[:, None], axis=0) @@ -209,19 +192,13 @@ def block_sparse_flash_decode_gqa_indice_triton( num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 64 # num_sm = self.num_sm num_splits = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # 
print("num_splits:", num_splits, "num_blocks:", num_n_blocks) @@ -295,24 +272,18 @@ def block_sparse_flash_decode_gqa_indice_triton( return output -def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] dim_v = value.shape[-1] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values based on block_indices @@ -321,42 +292,33 @@ def ref_program_torch(query, key, value, block_indices, cache_seqlens, max_cache valid_indices = block_indices[b, h] # Extract indices for this batch and head for idx in valid_indices: if idx >= 0: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def ref_program_fa(query, key, value, cache_seqlens): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def main(batch=64, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): - +def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): 
batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size @@ -369,34 +331,29 @@ def main(batch=64, dtype = torch.float16 block_H = 64 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence print("cache_seqlens: ", cache_seqlens) max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_indices with -1 (for padding blocks) - block_indices = torch.full((batch, heads_kv, max_selected_blocks), - -1, - dtype=torch.int32, - device='cuda') + block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): max_valid_block = max_valid_num_blocks[b].item() # Max valid blocks for this batch if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - valid_indices = torch.randperm( - max_valid_block, device='cuda', dtype=torch.int32)[:max_selected_blocks] - block_indices[b, h, :len(valid_indices)] = valid_indices + valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] + block_indices[b, h, : len(valid_indices)] = valid_indices # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) @@ -408,8 +365,7 @@ def main(batch=64, max_num_blocks = torch.max(max_valid_num_blocks).item() print("max_num_blocks: ", max_num_blocks) - ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) triton_out = block_sparse_flash_decode_gqa_indice_triton( Q, @@ -423,8 +379,7 @@ def main(batch=64, ) print("max difference: ", torch.max(torch.abs(ref - triton_out))) - assert torch.allclose( - ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" + assert torch.allclose(ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" print("Passed the ref test!") # Measure performance @@ -466,15 +421,13 @@ def main(batch=64, if __name__ == "__main__": parser = argparse.ArgumentParser() - 
parser.add_argument('--batch', type=int, default=64, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=64, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py index 348572526..232bcacaf 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_mask.py @@ -4,19 +4,14 @@ import argparse from einops import rearrange, einsum import torch.nn.functional as F - import math import time from heuristic import num_splits_heuristic @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_H', 'BLOCK_N', 'BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_H", "BLOCK_N", "BLOCK_D"], ) @triton.jit def _split_kernel( @@ -77,16 +72,11 @@ def _split_kernel( loop_range = blocks_per_split q_ptr += batch_idx * stride_q_b + head_idx_q * stride_q_h - k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[ - None, :] * stride_k_s + offs_d[:, None] * stride_k_d - v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, - None] * stride_v_s + offs_d[ - None, :] * stride_v_d + k_cache_ptr += batch_idx * stride_k_b + head_idx_kv * stride_k_h + offs_n[None, :] * stride_k_s + offs_d[:, None] * stride_k_d + v_cache_ptr += batch_idx * stride_v_b + head_idx_kv * stride_v_h + offs_n[:, None] * stride_v_s + offs_d[None, :] * stride_v_d mask_ptr += batch_idx * stride_mask_b + head_idx_kv * stride_mask_h - q = tl.load( - q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, - mask=offs_h[:, None] < gqa_group_size) + q = tl.load(q_ptr + offs_h[:, None] * stride_q_h + offs_d[None, :] * stride_q_d, mask=offs_h[:, None] < gqa_group_size) start = blocks_per_split * split_idx + tl.minimum(split_idx, remaining_blocks) for block_idx in 
range(loop_range): start_n = (start + block_idx) * BLOCK_N @@ -117,23 +107,18 @@ def _split_kernel( acc = acc * l_recip acc = acc.to(o_partial_ptr.dtype.element_ty) - lse_partial_ptr += batch_idx * stride_lse_b + ( - head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split + lse_partial_ptr += batch_idx * stride_lse_b + (head_idx_q + offs_h) * stride_lse_h + split_idx * stride_lse_split tl.store(lse_partial_ptr, m_i, mask=offs_h < gqa_group_size) - o_partial_ptr += batch_idx * stride_o_b + ( - head_idx_q + - offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + o_partial_ptr += ( + batch_idx * stride_o_b + (head_idx_q + offs_h[:, None]) * stride_o_h + split_idx * stride_o_split + offs_d[None, :] * stride_o_d + ) tl.store(o_partial_ptr, acc, mask=offs_h[:, None] < gqa_group_size) @triton.autotune( - configs=[ - triton.Config({}, num_warps=num_warps, num_stages=num_stages) - for num_warps in [1, 2, 4]\ - for num_stages in [1, 2, 3, 4, 7] - ], - key=['BLOCK_D'], + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4] for num_stages in [1, 2, 3, 4, 7]], + key=["BLOCK_D"], ) @triton.jit def _merge_kernel( @@ -161,18 +146,15 @@ def _merge_kernel( offs_d = tl.arange(0, BLOCK_D) lse_offsets = lse_partial_ptr + batch_idx * lse_partial_stride_b + head_idx * lse_partial_stride_h - lse = tl.load( - lse_offsets + offs_splits * lse_partial_stride_split, - mask=offs_splits < num_splits, - other=float("-inf")) + lse = tl.load(lse_offsets + offs_splits * lse_partial_stride_split, mask=offs_splits < num_splits, other=float("-inf")) lse_max = tl.max(lse) o_offsets = o_partial_ptr + batch_idx * o_partial_stride_b + head_idx * o_partial_stride_h o_partial = tl.load( - o_offsets + offs_splits[:, None] * o_partial_stride_split + - offs_d[None, :] * o_partial_stride_d, - mask=offs_splits[:, None] < num_splits) + o_offsets + offs_splits[:, None] * o_partial_stride_split + offs_d[None, :] * o_partial_stride_d, + mask=offs_splits[:, None] < num_splits, + ) sumexp_normalized_splitk = tl.exp(lse - lse_max) sumexp_normalized = tl.sum(sumexp_normalized_splitk, axis=0) numerator_normalized = tl.sum(o_partial * sumexp_normalized_splitk[:, None], axis=0) @@ -207,19 +189,13 @@ def block_sparse_flash_decode_gqa_mask_triton( num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * ( - dim + dim_v) * 2 #kv_seqlen * (dim + dim_v) * 2 + size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks num_sm = 64 # num_sm = self.num_sm num_splits = num_splits_heuristic( - total_mblocks, - num_sm, - num_n_blocks, - num_m_blocks, - size_one_kv_head, - is_causal_or_local=True, - max_splits=128) + total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 + ) # print("num_splits:", num_splits, "num_blocks:", num_n_blocks) @@ -292,24 +268,18 @@ def block_sparse_flash_decode_gqa_mask_triton( return output -def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size): - +def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size): batch, heads, dim = query.shape heads_kv = key.shape[2] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, 
heads_kv, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, heads_kv, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, heads_kv, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, heads_kv, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, heads_kv, seqlen_kv] sparse_mask = torch.zeros_like(scores) # Assign mask values @@ -317,43 +287,34 @@ def ref_program_torch(query, key, value, block_mask, cache_seqlens, max_cache_se for h in range(heads_kv): for idx in range(num_blocks): if block_mask[b, h, idx]: - sparse_mask[b, :, h, idx * block_size:(idx + 1) * block_size] = 1 + sparse_mask[b, :, h, idx * block_size : (idx + 1) * block_size] = 1 - scores = scores.masked_fill(sparse_mask == 0, float('-inf')) + scores = scores.masked_fill(sparse_mask == 0, float("-inf")) - range_len = torch.arange(scores.shape[-1], device='cuda').unsqueeze(0) + range_len = torch.arange(scores.shape[-1], device="cuda").unsqueeze(0) cache_seqlens_expanded = cache_seqlens.unsqueeze(1) pad_mask = range_len >= cache_seqlens_expanded pad_mask = pad_mask[:, None, None, :] - scores = scores.masked_fill(pad_mask, float('-inf')) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] + scores = scores.masked_fill(pad_mask, float("-inf")) + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, heads_kv, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, heads_kv, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, heads_kv, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def ref_program_fa(query, key, value, cache_seqlens): # latency reference # from flash_attn_interface import flash_attn_with_kvcache # fa3 - from flash_attn import flash_attn_with_kvcache #fa2 + from flash_attn import flash_attn_with_kvcache # fa2 + query = query.unsqueeze(1) output = flash_attn_with_kvcache(query, key, value, cache_seqlens=cache_seqlens) output = output.squeeze(1) return output -def main(batch=64, - heads=32, - heads_kv=8, - max_cache_seqlen=8192, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32): - +def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v block_size = block_size sparse_ratio = sparse_ratio @@ -363,14 +324,13 @@ def main(batch=64, dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device='cuda') - K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device='cuda') - V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device='cuda') - cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device='cuda') + Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") + K = 
torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") + V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") + cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") # Ensure at least one element equals cache_seqlen - random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - cache_seqlens[ - random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence + random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index + cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence num_blocks = (max_cache_seqlen + block_size - 1) // block_size @@ -379,7 +339,7 @@ def main(batch=64, max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_mask with false (for padding blocks) - block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device='cuda') + block_mask = torch.zeros((batch, heads_kv, num_blocks), dtype=torch.bool, device="cuda") # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -387,11 +347,10 @@ def main(batch=64, valid_num_block = valid_num_blocks[b].item() # Valid blocks for this batch if valid_num_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): - perm = torch.randperm(max_valid_block, device='cuda')[:valid_num_block] + perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True - ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, - block_size) + ref = ref_program_torch(Q, K, V, block_mask, cache_seqlens, max_cache_seqlen, num_blocks, block_size) triton_out = block_sparse_flash_decode_gqa_mask_triton( Q, @@ -404,8 +363,7 @@ def main(batch=64, ) # print("max difference: ", torch.max(torch.abs(ref - triton_out))) - assert torch.allclose( - ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" + assert torch.allclose(ref, triton_out, atol=1e-2), "Output mismatch between Triton and reference implementation" print("Passed the ref test!") # Measure performance @@ -448,15 +406,13 @@ def main(batch=64, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--heads_kv', type=int, default=8, help='heads_kv') - parser.add_argument( - '--max_cache_seqlen', type=int, default=8192, help='kvcache sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--dim_v', type=int, default=128, help='dim_v') - parser.add_argument('--sparse_ratio', type=float, default=0.8, help='sparse ratio') - parser.add_argument('--block_size', type=int, default=32, help='block_size') + parser.add_argument("--batch", type=int, default=64, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--heads_kv", type=int, default=8, help="heads_kv") + parser.add_argument("--max_cache_seqlen", type=int, default=8192, help="kvcache sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--dim_v", type=int, default=128, help="dim_v") + parser.add_argument("--sparse_ratio", 
type=float, default=0.8, help="sparse ratio") + parser.add_argument("--block_size", type=int, default=32, help="block_size") args = parser.parse_args() - main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, - args.sparse_ratio, args.block_size) + main(args.batch, args.heads, args.heads_kv, args.max_cache_seqlen, args.dim, args.dim_v, args.sparse_ratio, args.block_size) diff --git a/examples/blocksparse_attention/heuristic.py b/examples/blocksparse_attention/heuristic.py index b60a81dc3..0e6fc5281 100644 --- a/examples/blocksparse_attention/heuristic.py +++ b/examples/blocksparse_attention/heuristic.py @@ -1,8 +1,7 @@ import math -def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, - is_causal_or_local, max_splits): +def num_splits_heuristic(total_mblocks, num_SMs, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local, max_splits): """ Determines the optimal number of splits for maximizing GPU occupancy while balancing memory efficiency. diff --git a/examples/blocksparse_attention/regression_example_blocksparse_attention.py b/examples/blocksparse_attention/regression_example_blocksparse_attention.py new file mode 100644 index 000000000..26fa60df5 --- /dev/null +++ b/examples/blocksparse_attention/regression_example_blocksparse_attention.py @@ -0,0 +1,20 @@ +import tilelang.testing +import example_tilelang_block_sparse_attn +import example_tilelang_sparse_gqa_decode_varlen_indice +import example_tilelang_sparse_gqa_decode_varlen_mask + + +def regression_example_tilelang_block_sparse_attn(): + tilelang.testing.process_func(example_tilelang_block_sparse_attn.run_regression_perf) + + +def regression_example_tilelang_sparse_gqa_decode_varlen_indice(): + tilelang.testing.process_func(example_tilelang_sparse_gqa_decode_varlen_indice.run_regression_perf, batch=1, max_cache_seqlen=2048) + + +def regression_example_tilelang_sparse_gqa_decode_varlen_mask(): + tilelang.testing.process_func(example_tilelang_sparse_gqa_decode_varlen_mask.run_regression_perf, batch=1, max_cache_seqlen=2048) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/blocksparse_attention/test_example_blocksparse_attention.py b/examples/blocksparse_attention/test_example_blocksparse_attention.py index 88527f7b3..dd33f46c4 100644 --- a/examples/blocksparse_attention/test_example_blocksparse_attention.py +++ b/examples/blocksparse_attention/test_example_blocksparse_attention.py @@ -25,26 +25,14 @@ def test_example_tilelang_sparse_gqa_decode_varlen_mask(): def test_example_triton_sparse_gqa_decode_varlen_indice(): example_triton_sparse_gqa_decode_varlen_indice.main( - batch=16, - heads=16, - heads_kv=8, - max_cache_seqlen=4096, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32) + batch=8, heads=8, heads_kv=4, max_cache_seqlen=2048, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 + ) def test_example_triton_sparse_gqa_decode_varlen_mask(): example_triton_sparse_gqa_decode_varlen_mask.main( - batch=16, - heads=16, - heads_kv=8, - max_cache_seqlen=4096, - dim=128, - dim_v=128, - sparse_ratio=0.8, - block_size=32) + batch=16, heads=16, heads_kv=8, max_cache_seqlen=1024, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 + ) if __name__ == "__main__": diff --git a/examples/blocksparse_gemm/example_blocksparse_gemm.py b/examples/blocksparse_gemm/example_blocksparse_gemm.py index 7b9cff7c1..178cc5984 100644 --- a/examples/blocksparse_gemm/example_blocksparse_gemm.py +++ 
b/examples/blocksparse_gemm/example_blocksparse_gemm.py @@ -6,6 +6,7 @@ from tilelang.utils.tensor import get_tensor_supply, TensorSupplyType import torch from typing import List +from tilelang.profiler import do_bench DEFAULT_BLOCK_M = 128 DEFAULT_BLOCK_N = 128 @@ -19,8 +20,7 @@ parser.add_argument("--n", type=int, default=1024, help="Matrix dimension N") parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") parser.add_argument("--sparsity", type=float, default=0.5, help="Sparsity ratio (0-1)") -parser.add_argument( - "--use_autotune", action="store_true", default=False, help="Whether to use autotune") +parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune") args, _ = parser.parse_known_args() M, N, K = args.m, args.n, args.k @@ -41,17 +41,19 @@ def get_configs(): thread_num = [128, 256] enable_rasterization = [True, False] - _configs = list( - itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasterization)) + _configs = list(itertools.product(block_M, block_N, block_K, num_stages, thread_num, enable_rasterization)) - return [{ - "block_M": c[0], - "block_N": c[1], - "block_K": c[2], - "num_stages": c[3], - "thread_num": c[4], - "enable_rasteration": c[5], - } for c in _configs] + return [ + { + "block_M": c[0], + "block_N": c[1], + "block_K": c[2], + "num_stages": c[3], + "thread_num": c[4], + "enable_rasteration": c[5], + } + for c in _configs + ] def ref_program(A, B, BlockMask, block_M, block_N, block_K): @@ -61,12 +63,10 @@ def ref_program(A, B, BlockMask, block_M, block_N, block_K): accu = torch.zeros((block_M, block_N), dtype=torch.float32, device=A.device) for k in range(K // block_K): if BlockMask[i, j, k]: - accu += ( - A[i * block_M:(i + 1) * block_M, k * block_K:(k + 1) * block_K].to( - torch.float32) @ B[k * block_K:(k + 1) * block_K, - j * block_N:(j + 1) * block_N].to(torch.float32)) - ref_c[i * block_M:(i + 1) * block_M, - j * block_N:(j + 1) * block_N] = accu.to(torch.float16) + accu += A[i * block_M : (i + 1) * block_M, k * block_K : (k + 1) * block_K].to(torch.float32) @ B[ + k * block_K : (k + 1) * block_K, j * block_N : (j + 1) * block_N + ].to(torch.float32) + ref_c[i * block_M : (i + 1) * block_M, j * block_N : (j + 1) * block_N] = accu.to(torch.float16) return ref_c @@ -89,28 +89,21 @@ def supply_program(params: List[KernelParam]): return input_tensors -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit(out_idx=[-1]) -def blocksparse_matmul(M, - N, - K, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): - +def blocksparse_matmul( + M, N, K, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32 +): block_mask_shape = (M // block_M, N // block_N, K // block_K) @T.prim_func def block_sparse_matmul( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - BlockMask: T.Tensor(block_mask_shape, "bool"), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + BlockMask: T.Tensor(block_mask_shape, "bool"), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -134,7 +127,6 @@ def block_sparse_matmul( def main(): - # Initialize input matrices A and B on the GPU with half precision a = torch.randn(M, K).cuda().half() b = 
torch.randn(K, N).cuda().half() @@ -147,8 +139,7 @@ def main(): best_config = kernel.config best_latency = kernel.latency - block_M, block_N, block_K = best_config["block_M"], best_config["block_N"], best_config[ - "block_K"] + block_M, block_N, block_K = best_config["block_M"], best_config["block_N"], best_config["block_K"] print(f"Best Config: {best_config}") print(f"Sparsity Ratio: {sparsity}") @@ -163,10 +154,10 @@ def main(): block_K=DEFAULT_BLOCK_K, num_stages=DEFAULT_NUM_STAGES, thread_num=DEFAULT_THREAD_NUM, - enable_rasteration=DEFAULT_ENABLE_RASTERIZATION) + enable_rasteration=DEFAULT_ENABLE_RASTERIZATION, + ) block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})") - # Create block mask with desired sparsity mask_shape = (M // block_M, N // block_N, K // block_K) block_mask = torch.rand(mask_shape).cuda() > sparsity @@ -185,5 +176,32 @@ def main(): print(e) +def run_regression_perf(): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + + kernel = blocksparse_matmul( + M, + N, + K, + block_M=DEFAULT_BLOCK_M, + block_N=DEFAULT_BLOCK_N, + block_K=DEFAULT_BLOCK_K, + num_stages=DEFAULT_NUM_STAGES, + thread_num=DEFAULT_THREAD_NUM, + enable_rasteration=DEFAULT_ENABLE_RASTERIZATION, + ) + block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K + mask_shape = (M // block_M, N // block_N, K // block_K) + block_mask = torch.rand(mask_shape).cuda() > sparsity + + def run_kernel_only(): + kernel(a, b, block_mask) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/blocksparse_gemm/regression_example_blocksparse_gemm.py b/examples/blocksparse_gemm/regression_example_blocksparse_gemm.py new file mode 100644 index 000000000..81900a00c --- /dev/null +++ b/examples/blocksparse_gemm/regression_example_blocksparse_gemm.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_blocksparse_gemm + + +def regression_example_blocksparse_gemm(): + tilelang.testing.process_func(example_blocksparse_gemm.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/cast/example_group_per_split_token_cast_to_fp8.py b/examples/cast/example_group_per_split_token_cast_to_fp8.py index 4c2f574c0..db6beab1e 100644 --- a/examples/cast/example_group_per_split_token_cast_to_fp8.py +++ b/examples/cast/example_group_per_split_token_cast_to_fp8.py @@ -5,8 +5,8 @@ from tilelang.utils.tensor import torch_assert_close # support bfloat16, float, float16 -dtype = "bfloat16" -accum_dtype = "float" +dtype = T.bfloat16 +accum_dtype = T.float32 @tilelang.jit(out_idx=[2, 3]) @@ -16,11 +16,13 @@ def group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m): fp8_max = 448.0 @T.prim_func - def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor( - (BG,), "int32"), X_fp8: T.Tensor((BG, M_max, N), "float8_e4m3"), X_amax: T.Tensor( - (BG, M_max, T.ceildiv(N, group_size)), accum_dtype)): - with T.Kernel( - T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) as (bx, by, bz): + def group_per_split_token_cast( + X: T.Tensor((M, N), dtype), + batch_sizes: T.Tensor((BG,), T.int32), + X_fp8: T.Tensor((BG, M_max, N), T.float8_e4m3fn), + X_amax: T.Tensor((BG, M_max, T.ceildiv(N, group_size)), accum_dtype), + ): + with T.Kernel(T.ceildiv(M_max, blk_m), T.ceildiv(N, group_size), BG, threads=128) 
as (bx, by, bz): row = bx row_g_id = by bg = bz @@ -28,39 +30,29 @@ def group_per_split_token_cast(X: T.Tensor((M, N), dtype), batch_sizes: T.Tensor y_amax_local = T.alloc_fragment((blk_m,), accum_dtype) y_s_local = T.alloc_fragment((blk_m,), accum_dtype) y_q_local = T.alloc_fragment((blk_m, group_size), accum_dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") - row_offset = T.alloc_fragment((1,), "int32") + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), T.float8_e4m3fn) + row_offset = T.alloc_var(dtype=T.int32) - T.annotate_layout({ - y_local: - T.Fragment( - y_local.shape, - forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), - }) - - row_offset[0] = 0 + row_offset = 0 for i in T.serial(bg): - row_offset[0] += batch_sizes[i] + row_offset += batch_sizes[i] T.copy( - X[row_offset[0] + row * blk_m:row_offset[0] + (row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size], y_local) + X[row_offset + row * blk_m : row_offset + (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size], + y_local, + ) T.reduce_absmax(y_local, y_amax_local, dim=1) for i in T.Parallel(blk_m): y_amax_local[i] = T.max(y_amax_local[i], 1e-4) - y_s_local[i] = T.if_then_else(row * blk_m + i < batch_sizes[bg], - y_amax_local[i] / fp8_max, 0) + y_s_local[i] = T.if_then_else(row * blk_m + i < batch_sizes[bg], y_amax_local[i] / fp8_max, 0) for i, j in T.Parallel(blk_m, group_size): y_q_local[i, j] = T.clamp(y_local[i, j] / y_s_local[i], fp8_min, fp8_max) T.copy(y_q_local, y_q_local_fp8) for i, j in T.Parallel(blk_m, group_size): - y_q_local_fp8[i, j] = T.if_then_else(row * blk_m + i < batch_sizes[bg], - y_q_local[i, j], 0) + y_q_local_fp8[i, j] = T.if_then_else(row * blk_m + i < batch_sizes[bg], y_q_local[i, j], 0) for i in T.Parallel(blk_m): X_amax[bg, row * blk_m + i, row_g_id] = y_s_local[i] - T.copy( - y_q_local_fp8, X_fp8[bg, row * blk_m:(row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size]) + T.copy(y_q_local_fp8, X_fp8[bg, row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size]) return group_per_split_token_cast @@ -127,8 +119,7 @@ def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor: return x.squeeze(0) if remove_dim else x # Normal layout requires transposing - aligned_x = torch.transpose( - torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) + aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) aligned_x[:, :m, :] = x aligned_x = aligned_x[:, :m, :] return aligned_x.squeeze(0) if remove_dim else aligned_x @@ -146,31 +137,35 @@ def ref_per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens x_fp8 = x_fp8.view(m, -1)[:, :n].contiguous() return x_fp8, (x_amax / 448.0).view(m, -1) -def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> \ - Tuple[torch.Tensor, torch.Tensor]: + +def ref_program(x: torch.Tensor, batch_sizes: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # assert x.shape[0] == batch_sizes.sum() M_max = ceil_div(batch_sizes.max(), 128) * 128 split_x = torch.split(x, batch_sizes.tolist(), dim=0) padded_x = [torch.nn.functional.pad(t, (0, 0, 0, M_max - t.shape[0])) for t in split_x] num_groups, m, n = batch_sizes.shape[0], M_max, x.shape[1] - x_fp8 = (torch.empty((num_groups, m, n), device='cuda', dtype=torch.float8_e4m3fn), - torch.empty((num_groups, m, n // 128), device='cuda', dtype=torch.float)) + x_fp8 = ( + torch.empty((num_groups, m, n), device="cuda", 
dtype=torch.float8_e4m3fn), + torch.empty((num_groups, m, n // 128), device="cuda", dtype=torch.float), + ) for i in range(num_groups): x_fp8[0][i], x_fp8[1][i] = ref_per_token_cast_to_fp8(padded_x[i]) x_fp8 = (x_fp8[0], get_col_major_tma_aligned_tensor(x_fp8[1])) return x_fp8 -def main(M=8192, N=8192, BG=2, blk_m=8): - if dtype == "float": +def main(M=8192, N=8192, BG=2, blk_m=8, batch_sizes=None): + if batch_sizes is None: + batch_sizes = [2048, 6144] + if dtype == T.float: x = torch.randn(M, N, device="cuda", dtype=torch.float32) - elif dtype == "float16": + elif dtype == T.float16: x = torch.randn(M, N, device="cuda", dtype=torch.float16) - elif dtype == "bfloat16": + elif dtype == T.bfloat16: x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16) else: raise ValueError(f"Unsupported dtype: {dtype}") - batch_sizes = torch.tensor([2048, 6144], device="cuda", dtype=torch.int32) + batch_sizes = torch.tensor(batch_sizes, device="cuda", dtype=torch.int32) M_max = int(ceil_div(batch_sizes.max(), 128) * 128) print("batch_sizes:", batch_sizes) @@ -204,5 +199,35 @@ def run_torch(): print("Torch: {:.2f} ms".format(latency)) +def run_regression_perf(M=8192, N=8192, BG=2, blk_m=8, batch_sizes=None): + if batch_sizes is None: + batch_sizes = [2048, 6144] + if dtype == "float": + x = torch.randn(M, N, device="cuda", dtype=torch.float32) + elif dtype == "float16": + x = torch.randn(M, N, device="cuda", dtype=torch.float16) + elif dtype == "bfloat16": + x = torch.randn(M, N, device="cuda", dtype=torch.bfloat16) + else: + raise ValueError(f"Unsupported dtype: {dtype}") + batch_sizes = torch.tensor(batch_sizes, device="cuda", dtype=torch.int32) + M_max = int(ceil_div(batch_sizes.max(), 128) * 128) + + kernel = group_per_split_token_cast_to_fp8(M, M_max, N, BG, blk_m) + + x_fp8, x_amax = kernel(x, batch_sizes) + x_fp8_ref, x_amax_ref = ref_program(x, batch_sizes) + + torch_assert_close(x_fp8.to(torch.float32), x_fp8_ref.to(torch.float32), rtol=0.01, atol=0.01) + torch_assert_close(x_amax, x_amax_ref, rtol=0.01, atol=0.01) + + from tilelang.profiler import do_bench + + def run_tilelang(): + kernel(x, batch_sizes) + + return do_bench(run_tilelang, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/cast/example_per_token_cast_to_fp8.py b/examples/cast/example_per_token_cast_to_fp8.py index 484a092f0..4b3730b4b 100644 --- a/examples/cast/example_per_token_cast_to_fp8.py +++ b/examples/cast/example_per_token_cast_to_fp8.py @@ -7,14 +7,15 @@ @tilelang.jit(out_idx=[1, 2]) def per_token_cast_to_fp8(M, N, blk_m): - dtype = "float" + dtype = T.float group_size = 128 fp8_min = -448.0 fp8_max = 448.0 @T.prim_func - def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e4m3"), - X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype)): + def per_token_cast( + X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), T.float8_e4m3fn), X_amax: T.Tensor((M, T.ceildiv(N, group_size)), dtype) + ): with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (bx, by): row = bx row_g_id = by @@ -22,18 +23,9 @@ def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e y_amax_local = T.alloc_fragment((blk_m,), dtype) y_s_local = T.alloc_fragment((blk_m,), dtype) y_q_local = T.alloc_fragment((blk_m, group_size), dtype) - y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), "float8_e4m3") - - T.annotate_layout({ - y_local: - T.Fragment( - y_local.shape, - forward_thread_fn=lambda i, j: (i // (blk_m // 4)) * 32 + j % 32), - }) - - T.copy( - 
X[row * blk_m:(row + 1) * blk_m, row_g_id * group_size:(row_g_id + 1) * group_size], - y_local) + y_q_local_fp8 = T.alloc_fragment((blk_m, group_size), T.float8_e4m3fn) + + T.copy(X[row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size], y_local) T.reduce_absmax(y_local, y_amax_local, dim=1) for i in T.Parallel(blk_m): y_amax_local[i] = T.max(y_amax_local[i], 1e-4) @@ -43,9 +35,7 @@ def per_token_cast(X: T.Tensor((M, N), dtype), X_fp8: T.Tensor((M, N), "float8_e T.copy(y_q_local, y_q_local_fp8) for i in T.Parallel(blk_m): X_amax[row * blk_m + i, row_g_id] = y_s_local[i] - T.copy( - y_q_local_fp8, X_fp8[row * blk_m:(row + 1) * blk_m, - row_g_id * group_size:(row_g_id + 1) * group_size]) + T.copy(y_q_local_fp8, X_fp8[row * blk_m : (row + 1) * blk_m, row_g_id * group_size : (row_g_id + 1) * group_size]) return per_token_cast @@ -102,16 +92,32 @@ def main(M=8192, N=8192, blk_m=8): print("Tile-lang: {:.2f} ms".format(latency)) from tilelang.profiler import do_bench - from example_triton_cast_to_fp8 import per_token_group_quant_fp8 - def run_triton(): - x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8( - x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) - return x_fp8_triton_, x_amax_triton_ + # Triton fp8e4nv is only supported on Hopper (SM90) and later + major, _ = torch.cuda.get_device_capability() + if major >= 9: + from example_triton_cast_to_fp8 import per_token_group_quant_fp8 + + def run_triton(): + x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8(x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) + return x_fp8_triton_, x_amax_triton_ + + x_fp8_triton, x_amax_triton = run_triton() + latency = do_bench(run_triton) + print("Triton: {:.2f} ms".format(latency)) + else: + print("Triton fp8e4nv benchmark skipped (requires SM90+)") + + +def run_regression_perf(M=8192, N=8192, blk_m=8): + kernel = per_token_cast_to_fp8(M, N, blk_m) + x = torch.randn(M, N, device="cuda", dtype=torch.float32) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(x) - x_fp8_triton, x_amax_triton = run_triton() - latency = do_bench(run_triton) - print("Triton: {:.2f} ms".format(latency)) + return do_bench(run_kernel_only, backend="cupti") if __name__ == "__main__": diff --git a/examples/cast/example_triton_cast_to_fp8.py b/examples/cast/example_triton_cast_to_fp8.py index cc56defe7..1859433f1 100644 --- a/examples/cast/example_triton_cast_to_fp8.py +++ b/examples/cast/example_triton_cast_to_fp8.py @@ -128,9 +128,7 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. 
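The quantization rule that both the Tile-lang kernel above and this Triton kernel implement is small enough to restate in plain PyTorch. A simplified reference (the function name is ours; it assumes N is already a multiple of the 128-wide group and a PyTorch build with float8_e4m3fn support):

import torch

def per_token_group_fp8_ref(x: torch.Tensor, group_size: int = 128, eps: float = 1e-4):
    # Quantize each (row, group_size-wide) slice independently.
    m, n = x.shape
    assert n % group_size == 0, "simplified reference: padding is not handled here"
    g = x.view(m, n // group_size, group_size).float()
    amax = g.abs().amax(dim=-1).clamp_min(eps)   # per-group absolute maximum
    scale = amax / 448.0                         # 448 is the largest finite float8_e4m3fn value
    q = (g / scale.unsqueeze(-1)).clamp(-448.0, 448.0)
    return q.to(torch.float8_e4m3fn).view(m, n), scale  # scale: (m, n // group_size)

x = torch.randn(8, 256, dtype=torch.bfloat16)
x_fp8, x_scale = per_token_group_fp8_ref(x)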
""" - assert (x.shape[-1] % - group_size == 0), (f"the last dimension of `x` {x.shape[-1]} must be divisible " - f"by `group_size` {group_size}") + assert x.shape[-1] % group_size == 0, f"the last dimension of `x` {x.shape[-1]} must be divisible by `group_size` {group_size}" assert x.stride(-1) == 1, "`x` groups must be contiguous" finfo = torch.finfo(dtype) diff --git a/examples/cast/regression_example_cast.py b/examples/cast/regression_example_cast.py new file mode 100644 index 000000000..4bdfb99e7 --- /dev/null +++ b/examples/cast/regression_example_cast.py @@ -0,0 +1,17 @@ +import tilelang.testing +import example_group_per_split_token_cast_to_fp8 +import example_per_token_cast_to_fp8 + + +def regression_example_group_per_split_token_cast_to_fp8(): + tilelang.testing.process_func( + example_group_per_split_token_cast_to_fp8.run_regression_perf, M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896] + ) + + +def regression_example_per_token_cast_to_fp8(): + tilelang.testing.process_func(example_per_token_cast_to_fp8.run_regression_perf, M=2048, N=512, blk_m=8) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/cast/test_example_cast.py b/examples/cast/test_example_cast.py index 2f978c1d4..e8b10a797 100644 --- a/examples/cast/test_example_cast.py +++ b/examples/cast/test_example_cast.py @@ -4,11 +4,11 @@ def test_example_group_per_split_token_cast_to_fp8(): - example_group_per_split_token_cast_to_fp8.main(M=8192, N=2048, BG=2, blk_m=8) + example_group_per_split_token_cast_to_fp8.main(M=1024, N=1024, BG=2, blk_m=4, batch_sizes=[128, 896]) def test_example_per_token_cast_to_fp8(): - example_per_token_cast_to_fp8.main(M=8192, N=2048, blk_m=8) + example_per_token_cast_to_fp8.main(M=2048, N=512, blk_m=8) if __name__ == "__main__": diff --git a/examples/compile_flags/usecase.py b/examples/compile_flags/usecase.py index 8451b04fc..80e2b784b 100644 --- a/examples/compile_flags/usecase.py +++ b/examples/compile_flags/usecase.py @@ -4,12 +4,11 @@ # @tilelang.jit(compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"]) def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): @@ -36,8 +35,7 @@ def main( func = matmul(M, N, K, block_M, block_N, block_K) -jit_kernel = tilelang.compile( - func, out_idx=[2], target="cuda", compile_flags="-O3 --use_fast_math --expt-relaxed-constexpr") +jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags="-O3 --use_fast_math --expt-relaxed-constexpr") # or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"]) # or jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", compile_flags=["-O3 --use_fast_math --expt-relaxed-constexpr"]) diff --git a/examples/conftest.py b/examples/conftest.py index 9f49d40a9..4010e0d83 100644 --- a/examples/conftest.py +++ b/examples/conftest.py @@ -33,12 +33,9 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): "warnings", "error", } - if (sum( - len(terminalreporter.stats.get(k, [])) - for k in known_types.difference({"skipped", "deselected"})) == 0): + if sum(len(terminalreporter.stats.get(k, [])) for k in 
known_types.difference({"skipped", "deselected"})) == 0: terminalreporter.write_sep( "!", - (f"Error: No tests were collected. " - f"{dict(sorted((k, len(v)) for k, v in terminalreporter.stats.items()))}"), + (f"Error: No tests were collected. {dict(sorted((k, len(v)) for k, v in terminalreporter.stats.items()))}"), ) pytest.exit("No tests were collected.", returncode=5) diff --git a/examples/convolution/example_convolution.py b/examples/convolution/example_convolution.py index b2696ba8f..1599d3464 100644 --- a/examples/convolution/example_convolution.py +++ b/examples/convolution/example_convolution.py @@ -14,7 +14,6 @@ def check_hopper(): def ref_program(stride, padding, dilation): - def main(A, B): A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W @@ -26,38 +25,21 @@ def main(A, B): @tilelang.jit(out_idx=[2]) -def convolution(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - threads, - dtype="float16", - accum_dtype="float"): +def convolution(N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, threads, dtype=T.float16, accum_dtype=T.float32): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def main( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=threads) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -66,12 +48,6 @@ def main( kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - T.annotate_layout({ - out_shared: tilelang.layout.make_swizzled_layout(out_shared), - data_shared: tilelang.layout.make_swizzled_layout(data_shared), - kernel_shared: tilelang.layout.make_swizzled_layout(kernel_shared), - }) - T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): if is_hopper: @@ -82,10 +58,8 @@ def main( m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) @@ -97,15 +71,15 @@ def main( def main(argv=None): parser = argparse.ArgumentParser() - parser.add_argument('--n', type=int, default=128, help='n') - parser.add_argument('--c', type=int, default=128, help='c') - parser.add_argument('--h', type=int, default=64, help='h') - parser.add_argument('--w', type=int, default=64, help='w') - parser.add_argument('--f', type=int, 
default=128, help='f') - parser.add_argument('--k', type=int, default=3, help='k') - parser.add_argument('--s', type=int, default=1, help='s') - parser.add_argument('--d', type=int, default=1, help='d') - parser.add_argument('--p', type=int, default=1, help='p') + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") args = parser.parse_args(argv) N, C, H, W, F, K, S, D, P = args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p @@ -125,5 +99,30 @@ def main(argv=None): print("All checks passed.✅") +def run_regression_perf(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") + + args = parser.parse_args(argv) + N, C, H, W, F, K, S, D, P = args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p + + block_m = 64 + block_n = 128 + block_k = 32 + num_stages = 3 + threads = 256 + kernel = convolution(N, C, H, W, F, K, S, D, P, block_m, block_n, block_k, num_stages, threads) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/convolution/example_convolution_autotune.py b/examples/convolution/example_convolution_autotune.py index 393677489..c0c666402 100644 --- a/examples/convolution/example_convolution_autotune.py +++ b/examples/convolution/example_convolution_autotune.py @@ -14,7 +14,6 @@ def check_hopper(): def ref_program(stride, padding, dilation): - def main(A, B): A = A.permute(0, 3, 1, 2) # N, H, W, C -> N, C, H, W B = B.permute(3, 2, 0, 1) # H, W, C, F -> F, C, H, W @@ -40,7 +39,8 @@ def get_configs(): num_stages, thread_num, enable_rasterization, - )) + ) + ) configs = [ { @@ -50,7 +50,8 @@ def get_configs(): "num_stages": c[3], "thread_num": c[4], "enable_rasteration": c[5], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs @@ -64,69 +65,32 @@ def get_heuristic_config() -> dict: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version in {80}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 2, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 2, "thread_num": 128, "enable_rasteration": True} elif sm_version in {90}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 64, - "num_stages": 3, - "thread_num": 256, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 64, 
"num_stages": 3, "thread_num": 256, "enable_rasteration": True} else: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 0, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 0, "thread_num": 128, "enable_rasteration": True} @tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[2]) -def convolution(N, - C, - H, - W, - F, - K, - S, - D, - P, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): +def convolution( + N, C, H, W, F, K, S, D, P, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32 +): KH, KW = K, K OH = (H + 2 * P - D * (K - 1) - 1) // S + 1 OW = (W + 2 * P - D * (K - 1) - 1) // S + 1 - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 is_hopper = check_hopper() @T.prim_func def main( - data: T.Tensor((N, H, W, C), dtype), - kernel: T.Tensor((KH, KW, C, F), dtype), - out: T.Tensor((N, OH, OW, F), dtype), + data: T.Tensor((N, H, W, C), dtype), + kernel: T.Tensor((KH, KW, C, F), dtype), + out: T.Tensor((N, OH, OW, F), dtype), ): - with T.Kernel( - T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), - threads=thread_num) as (bx, by): + with T.Kernel(T.ceildiv(F, block_N), T.ceildiv(N * OH * OW, block_M), threads=thread_num) as (bx, by): data_shared = T.alloc_shared((block_M, block_K), dtype) kernel_shared = T.alloc_shared((block_K, block_N), dtype) out_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -135,11 +99,6 @@ def main( kernel_flat = T.Tensor((KH * KW * C, F), dtype, kernel.data) out_flat = T.Tensor((N * OH * OW, F), dtype, out.data) - if is_hopper: - T.annotate_layout({ - out_shared: tilelang.layout.make_swizzled_layout(out_shared), - }) - T.clear(out_local) for k_iter in T.Pipelined(T.ceildiv(KH * KW * C, block_K), num_stages=num_stages): if is_hopper: @@ -150,10 +109,8 @@ def main( m = by * block_M + i access_h = m % (OH * OW) // OW * S + k // (KW * C) * D - P access_w = m % OW * S + k // C % KW * D - P - in_bound = ((access_h >= 0) and (access_w >= 0) and (access_h < H) and - (access_w < W)) - data_shared[i, j] = T.if_then_else( - in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) + in_bound = (access_h >= 0) and (access_w >= 0) and (access_h < H) and (access_w < W) + data_shared[i, j] = T.if_then_else(in_bound, data[m // (OH * OW), access_h, access_w, k % C], 0) T.copy(kernel_flat[k_iter * block_K, bx * block_N], kernel_shared) T.gemm(data_shared, kernel_shared, out_local) @@ -166,17 +123,19 @@ def main( return main -def main(n: int = 128, - c: int = 128, - h: int = 64, - w: int = 64, - f: int = 128, - k: int = 3, - s: int = 1, - d: int = 1, - p: int = 1, - use_autotune: bool = False, - with_roller: bool = True): +def main( + n: int = 128, + c: int = 128, + h: int = 64, + w: int = 64, + f: int = 128, + k: int = 3, + s: int = 1, + d: int = 1, + p: int = 1, + use_autotune: bool = False, + with_roller: bool = True, +): N, C, H, W, F, K, S, D, P = n, c, h, w, f, k, s, d, p ref_prog = ref_program(S, P, D) @@ -194,27 +153,38 @@ def main(n: int = 128, print(f"Ref latency: {ref_latency}") +def run_regression_perf( + n: int = 128, + c: int = 128, + h: int = 64, + w: int = 64, + f: int = 128, + k: int = 3, + s: int = 1, + d: int = 1, + p: int = 1, + use_autotune: bool = False, + with_roller: bool = True, +): + N, C, H, W, F, K, S, D, P = n, c, h, w, f, k, s, d, p + config = 
get_heuristic_config() + kernel = convolution(N, C, H, W, F, K, S, D, P, **config) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument('--n', type=int, default=128, help='n') - parser.add_argument('--c', type=int, default=128, help='c') - parser.add_argument('--h', type=int, default=64, help='h') - parser.add_argument('--w', type=int, default=64, help='w') - parser.add_argument('--f', type=int, default=128, help='f') - parser.add_argument('--k', type=int, default=3, help='k') - parser.add_argument('--s', type=int, default=1, help='s') - parser.add_argument('--d', type=int, default=1, help='d') - parser.add_argument('--p', type=int, default=1, help='p') - parser.add_argument( - "--use_autotune", - action="store_true", - default=False, - help="Whether to use autotune for matmul configs") - parser.add_argument( - "--with_roller", - action="store_true", - default=True, - help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--n", type=int, default=128, help="n") + parser.add_argument("--c", type=int, default=128, help="c") + parser.add_argument("--h", type=int, default=64, help="h") + parser.add_argument("--w", type=int, default=64, help="w") + parser.add_argument("--f", type=int, default=128, help="f") + parser.add_argument("--k", type=int, default=3, help="k") + parser.add_argument("--s", type=int, default=1, help="s") + parser.add_argument("--d", type=int, default=1, help="d") + parser.add_argument("--p", type=int, default=1, help="p") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune for matmul configs") + parser.add_argument("--with_roller", action="store_true", default=True, help="Whether to enable BitBLAS roller for search space") args = parser.parse_args() - main(args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p, args.use_autotune, - args.with_roller) + main(args.n, args.c, args.h, args.w, args.f, args.k, args.s, args.d, args.p, args.use_autotune, args.with_roller) diff --git a/examples/convolution/regression_example_convolution.py b/examples/convolution/regression_example_convolution.py new file mode 100644 index 000000000..18d4bcb68 --- /dev/null +++ b/examples/convolution/regression_example_convolution.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_convolution +import example_convolution_autotune + + +def regression_example_convolution(): + tilelang.testing.process_func(example_convolution.run_regression_perf) + + +def regression_example_convolution_autotune(): + tilelang.testing.process_func(example_convolution_autotune.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py index 715f09a9b..18467a811 100644 --- a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py +++ b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py @@ -20,11 +20,11 @@ def tl_gemm( accum_dtype, ): assert in_dtype in [ - "float8_e4m3", + T.float8_e4m3fn, ], "Currently only float8_e4m3 is supported" assert out_dtype in [ - "bfloat16", - "float32", + T.bfloat16, + T.float32, ], "Currently only float16 and float32 are supported" group_size = 128 @@ -41,18 +41,17 @@ def tl_gemm( @T.prim_func def main( - A: T.Tensor(A_shape, 
in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), - scales_a: T.Tensor(Scales_A_shape, "float32"), - scales_b: T.Tensor(Scales_B_shape, "float32"), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + scales_a: T.Tensor(Scales_A_shape, T.float32), + scales_b: T.Tensor(Scales_B_shape, T.float32), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, in_dtype) C_shared = T.alloc_shared(C_shared_shape, out_dtype) - Scale_C_shared = T.alloc_shared((block_M), "float32") + Scale_C_shared = T.alloc_shared((block_M), T.float32) C_local = T.alloc_fragment(C_shared_shape, accum_dtype) C_local_accum = T.alloc_fragment(C_shared_shape, accum_dtype) @@ -93,21 +92,18 @@ def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: m, n = x.shape x_view = x.view(m, -1, 128) x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view( - m, n), (x_amax / 448.0).view(m, -1) + return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape - x_padded = torch.zeros( - ceildiv(m, 128) * 128, ceildiv(n, 128) * 128, dtype=x.dtype, device=x.device) + x_padded = torch.zeros(ceildiv(m, 128) * 128, ceildiv(n, 128) * 128, dtype=x.dtype, device=x.device) x_padded[:m, :n] = x x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view( - x_view.size(0), x_view.size(2)) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) def ref_deepgemm_fp8(A_fp8, B_fp8, A_scale, B_scale, out_dtype): @@ -127,13 +123,14 @@ def ref_deepgemm_fp8(A_fp8, B_fp8, A_scale, B_scale, out_dtype): c_acc.zero_() for k in range(ceildiv(K, 128)): c = torch._scaled_mm( - A_fp8[i * 128:(i + 1) * 128, k * 128:(k + 1) * 128], - B_fp8[j * 128:(j + 1) * 128, k * 128:(k + 1) * 128].T, + A_fp8[i * 128 : (i + 1) * 128, k * 128 : (k + 1) * 128], + B_fp8[j * 128 : (j + 1) * 128, k * 128 : (k + 1) * 128].T, scale_a=A_scales[i, k].view(128, 1).contiguous(), scale_b=B_scales[j, k].view(1, 128).contiguous(), - out_dtype=torch.bfloat16) + out_dtype=torch.bfloat16, + ) c_acc += c.to(torch.float32) - C[i * 128:(i + 1) * 128, j * 128:(j + 1) * 128] = c_acc.to(out_dtype) + C[i * 128 : (i + 1) * 128, j * 128 : (j + 1) * 128] = c_acc.to(out_dtype) return C @@ -179,11 +176,11 @@ def assert_tl_gemm_correctness(M, N, K, block_N, in_dtype, out_dtype, accum_dtyp def main(): - assert_tl_gemm_correctness(1024, 1024, 8192, 128, "float8_e4m3", "bfloat16", "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, 128, T.float8_e4m3fn, T.bfloat16, T.float32) if __name__ == "__main__": - for dtype in ["float8_e4m3"]: - for out_dtype in ["bfloat16", "float32"]: + for dtype in [T.float8_e4m3fn]: + for out_dtype in [T.bfloat16, T.float32]: for block_N in [16, 32, 64, 128]: - assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, "float32") + assert_tl_gemm_correctness(1024, 1024, 8192, block_N, dtype, out_dtype, T.float32) diff 
--git a/examples/deepseek_mla/README.md b/examples/deepseek_mla/README.md index e64b1c37d..bd3539d26 100644 --- a/examples/deepseek_mla/README.md +++ b/examples/deepseek_mla/README.md @@ -24,14 +24,14 @@ We benchmarked the performance of FlashMLA, TileLang, Torch, Triton, and FlashIn
Figure 2: Performance under batch size=128
-As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. +As shown in the results, TileLang achieves performance comparable to FlashMLA in most cases, significantly outperforming both FlashInfer and Triton. Notably, **TileLang accomplishes this with just around 80 lines of Python code**, demonstrating its exceptional ease of use and efficiency. Let's dive in and see how TileLang achieves this. ## Implementation First, let's review the core computation logic of traditional FlashAttention: -```python +```python # acc_s: [block_M, block_N] # scores_max: [block_M] # scores_scale: [block_M] @@ -54,7 +54,7 @@ Compared to traditional attention operators like MHA (Multi-Headed Attention) or This raises the question of how to partition the matrix multiplication operation. On the Hopper architecture, most computation kernels use [`wgmma.mma_async`](https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions) instructions for optimal performance. The `wgmma.mma_async` instruction organizes 4 warps (128 threads) into a warpgroup for collective MMA operations. However, `wgmma.mma_async` instructions require a minimum M dimension of 64. This means each warpgroup's minimum M dimension can only be reduced to 64, but a tile size of 64*512 is too large for a single warpgroup, leading to register spilling. -Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. +Therefore, our only option is to partition `acc_o` along the `dim` dimension, with two warpgroups computing the left and right part of `acc_o` respectively. However, this introduces another challenge: both warpgroups require the complete `acc_s` result as input. Our solution is to have each warpgroup compute half of `acc_s` during `Q @ K` computation, then obtain the other half computed by the other warpgroup through shared memory. @@ -96,7 +96,6 @@ T.use_swizzle(panel_size: int, order: str = "row") Here, `panel_size` specifies the width of the swizzled threadblock group, and `order` determines the swizzling pattern, which can be either "row" or "col". - ### Shared Memory Swizzling In CUDA programming, shared memory is divided into multiple memory banks, with each bank capable of servicing one thread request per clock cycle in parallel. Bank conflicts occur when multiple threads simultaneously access different addresses mapped to the same bank, forcing these accesses to be serialized and degrading performance. @@ -113,17 +112,14 @@ T.annotate_layout({ Here, `T.annotate_layout` allows users to specify any desired layout for a buffer. For convenience, TileLang provides the `make_swizzled_layout` primitive to automatically generate a swizzled layout. - ### Warp-Specialization The Hopper architecture commonly employs warp specialization for performance optimization. A typical approach is to designate one warpgroup as a producer that handles data movement using TMA (Tensor Memory Accelerator), while the remaining warpgroups serve as consumers performing computations. However, this programming pattern is complex, requiring developers to manually manage the execution logic for producers and consumers, including synchronization through the `mbarrier` objects. 
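To make that bookkeeping concrete, here is a minimal CPU-side sketch of the producer/consumer hand-off, written with ordinary Python threads and a two-slot buffer. None of the names below are TileLang or CUDA APIs; the semaphores only stand in for the `mbarrier` arrive/wait protocol a hand-written warp-specialized kernel has to manage, with the producer playing the role of the TMA warpgroup and the consumers played by a single compute thread.

```python
import threading

NUM_TILES = 8   # how many "tiles" the producer streams in
NUM_SLOTS = 2   # double buffering: two shared-memory slots

# One "full"/"empty" semaphore pair per slot plays the role of an mbarrier:
# the producer "arrives" on full after filling a slot, the consumer "arrives"
# on empty after it has finished reading it.
full = [threading.Semaphore(0) for _ in range(NUM_SLOTS)]
empty = [threading.Semaphore(1) for _ in range(NUM_SLOTS)]
slots = [None] * NUM_SLOTS
results = []

def producer():
    for k in range(NUM_TILES):
        s = k % NUM_SLOTS
        empty[s].acquire()                 # wait until the consumer has drained the slot
        slots[s] = list(range(k, k + 4))   # stand-in for a TMA copy into shared memory
        full[s].release()                  # "arrive": tile k is ready

def consumer():
    for k in range(NUM_TILES):
        s = k % NUM_SLOTS
        full[s].acquire()                  # wait for the producer's arrival on this slot
        results.append(sum(slots[s]))      # stand-in for the MMA work on the tile
        empty[s].release()                 # hand the slot back to the producer

t_prod = threading.Thread(target=producer)
t_cons = threading.Thread(target=consumer)
t_prod.start(); t_cons.start()
t_prod.join(); t_cons.join()
print(results)  # one partial result per tile, produced in pipeline order
```

Even in this toy form, every pipeline slot needs a matched acquire/release pair on both sides, and the slot indexing has to stay consistent between producer and consumer; scaling that to multiple stages and warpgroups is where hand-written warp-specialized kernels become error-prone.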
In TileLang, users are completely shielded from these implementation details. The frontend script is automatically transformed into a warp-specialized form, where TileLang handles all producer-consumer synchronization automatically, enabling efficient computation. - ### Pipeline - Pipeline is a technique used to improve memory access efficiency by overlapping memory access and computation. In TileLang, pipeline can be implemented through the `T.pipelined` annotation: ```python @@ -132,9 +128,8 @@ T.pipelined(range: int, stage: int) Here, `range` specifies the range of the pipeline, and `stage` specifies the stage of the pipeline. Multi-stage pipelining enables overlapping of computation and memory access, which can significantly improve performance for memory-intensive operators. However, setting a higher number of stages consumes more shared memory resources, so the optimal configuration needs to be determined based on specific use cases. - ### Split-KV We have also implemented Split-KV optimization similar to [FlashDecoding](https://pytorch.org/blog/flash-decoding/). Specifically, when the batch size is small, parallel SM resources cannot be fully utilized due to low parallelism. In such cases, we can split the kv_ctx dimension across multiple SMs for parallel computation and then merge the results. -In our implementation, we have developed both split and combine kernels, allowing users to control the split size through a `num_split` parameter. \ No newline at end of file +In our implementation, we have developed both split and combine kernels, allowing users to control the split size through a `num_split` parameter. diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py index db460437f..dccf333ad 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py @@ -8,6 +8,7 @@ def get_configs(): import itertools + BLOCK_N = [16, 32, 64, 128] BLOCK_H = [16, 32, 64, 128] num_split = [1, 2, 4, 8, 16, 32] @@ -15,45 +16,44 @@ def get_configs(): _configs = list(itertools.product(BLOCK_N, BLOCK_H, num_split, threads)) - return [{ - "block_N": c[0], - "block_H": c[1], - "num_split": c[2], - "threads": c[3], - } for c in _configs] + return [ + { + "block_N": c[0], + "block_H": c[1], + "num_split": c[2], + "threads": c[3], + } + for c in _configs + ] @tilelang.autotune(configs=get_configs()) @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashmla_decode(batch, - heads, - kv_head_num, - seqlen_kv, - dim, - pe_dim, - block_N, - block_H, - num_split, - threads=128): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + }, +) +def flashmla_decode(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, threads=128): + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_split( + Q: 
T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=threads) as (bx, by): + # flash_attn_split + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=threads) as (bx, by, bz): Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) KV_shared = T.alloc_shared([block_N, dim], dtype) @@ -69,34 +69,31 @@ def flash_attn( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_local) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_local) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_local) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_local) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(seqlen_kv, block_N) + loop_range = T.ceildiv((seqlen_kv // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=0): - T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) + kv_start = (seqlen_kv // num_split) * bz + k * block_N + kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N + T.copy(KV[bx, kv_start:kv_end, cur_kv_head, :], KV_shared) + T.copy(K_pe[bx, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) T.gemm(Q_local, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.gemm( - Q_pe_local, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_pe_local, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - # T.copy(acc_s, S_shared) T.copy(acc_s, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] @@ -105,20 +102,50 @@ def flash_attn( T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, glse[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz]) + T.copy(acc_o, Output_partial[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz, :]) + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = 
T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local[0] + for i in T.Parallel(dim): + Output[bz, by, i] = o_accum_local[i] - @T.macro - def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=threads) as (bx, by): Q_local = T.alloc_fragment([block_H, dim], dtype) Q_pe_local = T.alloc_fragment([block_H, pe_dim], dtype) KV_shared = T.alloc_shared([block_N, dim], dtype) @@ -134,34 +161,31 @@ def flash_attn_split( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_local) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_local) + + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_local) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_local) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=0): - kv_start = (seqlen_kv // num_split) * bz + k * block_N - kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N - T.copy(KV[bx, kv_start:kv_end, cur_kv_head, :], KV_shared) - T.copy(K_pe[bx, kv_start:kv_end, cur_kv_head, :], K_pe_shared) + T.copy(KV[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) T.gemm(Q_local, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.gemm( - Q_pe_local, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_pe_local, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = 
T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) + # T.copy(acc_s, S_shared) T.copy(acc_s, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] @@ -170,72 +194,7 @@ def flash_attn_split( T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz]) - T.copy(acc_o, Output_partial[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) + T.copy(acc_o, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) if num_split > 1: return main_split @@ -258,43 +217,36 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - 
q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') - parser.add_argument('--autotune', action='store_true', help='auto tune') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") + parser.add_argument("--autotune", action="store_true", help="auto tune") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim enable_autotune = args.autotune @@ -310,17 +262,7 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): if enable_autotune: kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim) else: - kernel = flashmla_decode( - batch, - heads, - kv_heads, - kv_ctx, - dim, - pe_dim, - BLOCK_N, - BLOCK_H, - num_split, - threads=threads) + kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, threads=threads) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) input_tensors 
= profiler._get_inputs() tilelang_output = kernel(*input_tensors) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py index 0006d9468..18c0a5f86 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_torch.py @@ -32,8 +32,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -94,8 +93,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -141,9 +139,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -309,24 +305,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -362,14 +364,15 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = 
torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" if target not in ["flash_mla_triton"]: @@ -377,21 +380,14 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -408,19 +404,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -429,26 
+422,22 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [128] for seqlen in [1024, 2048, 4096, 8192, 16384] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [128] + for seqlen in [1024, 2048, 4096, 8192, 16384] + for head in [128] +] def get_args(): @@ -470,26 +459,54 @@ def get_args(): for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py index 644f97da1..861e841c4 100644 --- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py +++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py @@ -29,8 +29,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def 
run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -91,8 +90,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -138,9 +136,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -306,24 +302,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -359,14 +361,15 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, 
dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" if target not in ["flash_mla_triton"]: @@ -374,21 +377,14 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -405,19 +401,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -426,26 +419,22 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [64, 128] for seqlen in [1024, 2048, 4096, 8192, 16384] for head in [128]] +shape_configs = [ + { + "b": 
batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [64, 128] + for seqlen in [1024, 2048, 4096, 8192, 16384] + for head in [128] +] def get_args(): @@ -467,26 +456,54 @@ def get_args(): for shape in shape_configs: if args.all: for target in available_targets: - perf = compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/benchmark_mla.py b/examples/deepseek_mla/benchmark_mla.py index a542ff611..544b5e128 100644 --- a/examples/deepseek_mla/benchmark_mla.py +++ b/examples/deepseek_mla/benchmark_mla.py @@ -33,8 +33,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] def ref_mla(): @@ -61,8 +60,7 @@ def ref_mla(): @torch.inference_mode() -def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, 
cache_seqlens, h_q, h_kv, d, dv, causal, dtype): from flash_mla import flash_mla_with_kvcache, get_mla_metadata blocked_v = blocked_k[..., :dv] @@ -87,14 +85,13 @@ def flash_mla(): @torch.inference_mode() -def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, - h_q, h_kv, d, dv, causal, dtype): +def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): # pip install flashinfer-python import flashinfer + assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() kv_indptr = [0] kv_indices = [] @@ -111,8 +108,7 @@ def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32) kv_indices = torch.tensor(kv_indices, dtype=torch.int32) - mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper( - torch.empty(128 * 1024 * 1024, dtype=torch.int8), backend="fa3") + mla_wrapper = flashinfer.mla.BatchMLAPagedAttentionWrapper(torch.empty(128 * 1024 * 1024, dtype=torch.int8), backend="fa3") mla_wrapper.plan( q_indptr, kv_indptr, @@ -129,12 +125,7 @@ def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q ) def flashinfer(): - output, lse = mla_wrapper.run( - q_nope.view(-1, h_q, dv), - q_pe.view(-1, h_q, d - dv), - blocked_k_nope, - blocked_k_pe, - return_lse=True) + output, lse = mla_wrapper.run(q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope, blocked_k_pe, return_lse=True) return output.view(b, -1, h_q, dv), lse.view(b, h_q, 1) out_flash, lse_flash = flashinfer() @@ -177,8 +168,7 @@ def _mla_attn_kernel( offs_d_ckv = tl.arange(0, HEAD_DIM_CKV) cur_head = cur_head_id * BLOCK_H + tl.arange(0, BLOCK_H) - offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[ - None, :] + offs_q_nope = cur_batch * stride_q_nope_bs + cur_head[:, None] * stride_q_nope_h + offs_d_ckv[None, :] q_nope = tl.load(Q_nope + offs_q_nope) offs_d_kpe = tl.arange(0, HEAD_DIM_KPE) @@ -224,9 +214,7 @@ def _mla_attn_kernel( e_sum = e_sum * re_scale + tl.sum(p, 1) e_max = n_e_max - offs_o = cur_batch * stride_o_b + cur_head[:, - None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[ - None, :] + offs_o = cur_batch * stride_o_b + cur_head[:, None] * stride_o_h + split_kv_id * stride_o_s + offs_d_ckv[None, :] tl.store(O + offs_o, acc / e_sum[:, None]) offs_o_1 = cur_batch * stride_o_b + cur_head * stride_o_h + split_kv_id * stride_o_s + HEAD_DIM_CKV tl.store(O + offs_o_1, e_max + tl.log(e_sum)) @@ -393,24 +381,30 @@ def mla_decode_triton( @torch.inference_mode() -def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_triton(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): blocked_v = blocked_k[..., :dv] assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., 
dv:].contiguous() def flash_mla_triton(): num_kv_splits = 32 o = torch.empty([b * s_q, h_q, dv]) attn_logits = torch.empty([b * s_q, h_q, num_kv_splits, dv + 1]) mla_decode_triton( - q_nope.view(-1, h_q, dv), q_pe.view(-1, h_q, d - dv), blocked_k_nope.view(-1, dv), - blocked_k_pe.view(-1, d - dv), o, block_table, cache_seqlens, attn_logits, - num_kv_splits, 1 / math.sqrt(d), block_size) + q_nope.view(-1, h_q, dv), + q_pe.view(-1, h_q, d - dv), + blocked_k_nope.view(-1, dv), + blocked_k_pe.view(-1, d - dv), + o, + block_table, + cache_seqlens, + attn_logits, + num_kv_splits, + 1 / math.sqrt(d), + block_size, + ) return o.view([b, s_q, h_q, dv]) out_flash = flash_mla_triton() @@ -419,13 +413,10 @@ def flash_mla_triton(): @torch.inference_mode() -def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - +def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() dpe = d - dv num_kv_splits = 1 @@ -434,8 +425,7 @@ def run_flash_mla_tilelang(q, block_table, blocked_k, max_seqlen_pad, block_size out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device) - kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size) + kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, num_kv_splits, block_size) def flash_mla_tilelang(): out = kernel( @@ -486,38 +476,31 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_a, lse_a, perf_a = baseline_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_a, lse_a, perf_a = baseline_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out" - if target not in ["flashinfer", "flash_mla_triton", "tilelang" - ] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]: + if target not in ["flashinfer", "flash_mla_triton", "tilelang"] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]: # flashinfer has a different lse return value # flash_mla_triton and flash_mla_tilelang doesn't return lse torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse" 
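+    # Performance model (derived from the two lines below): FLOPS counts the QK^T (d dims) and PV (dv dims) multiply-adds per KV element per head; `bytes` counts KV-cache reads plus Q reads and O writes at the element width of `dtype`.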
FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10 ** 9 / perf_a:.0f} TFLOPS, {bytes / 10 ** 6 / perf_a:.0f} GB/s" - ) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): - print( - f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" - ) + print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}") torch.set_default_dtype(dtype) device = torch.device("cuda:0") torch.set_default_device(device) @@ -534,19 +517,16 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): q = torch.randn(b, s_q, h_q, d) block_size = 64 - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) - out_b, lse_b, perf_b = target_func(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_b, lse_b, perf_b = target_func( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 - bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * ( - torch.finfo(dtype).bits // 8) - print( - f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10 ** 9 / perf_b:.0f} TFLOPS, {bytes / 10 ** 6 / perf_b:.0f} GB/s" - ) + bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") return bytes / 10**6 / perf_b @@ -558,26 +538,22 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): "flash_mla_triton", ] -shape_configs = [{ - "b": - batch, - "s_q": - 1, - "cache_seqlens": - torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), - "h_q": - head, - "h_kv": - 1, - "d": - 512 + 64, - "dv": - 512, - "causal": - True, - "dtype": - torch.float16 -} for batch in [128] for seqlen in [1024, 2048, 4096, 8192, 16384, 32768] for head in [128]] +shape_configs = [ + { + "b": batch, + "s_q": 1, + "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"), + "h_q": head, + "h_kv": 1, + "d": 512 + 64, + "dv": 512, + "causal": True, + "dtype": torch.float16, + } + for batch in [128] + for seqlen in [1024, 2048, 4096, 8192, 16384, 32768] + for head in [128] +] def get_args(): @@ -599,26 +575,54 @@ def get_args(): for shape in shape_configs: if args.all: for target in available_targets: - perf = 
compare_a(target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) elif args.compare: - perfa, prefb = compare_ab(args.baseline, args.target, shape["b"], shape["s_q"], - shape["cache_seqlens"], shape["h_q"], shape["h_kv"], - shape["d"], shape["dv"], shape["causal"], shape["dtype"]) + perfa, prefb = compare_ab( + args.baseline, + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.baseline},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perfa:.0f}\n' + f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perfa:.0f}\n" ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{prefb:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{prefb:.0f}\n" ) elif args.one: - perf = compare_a(args.target, shape["b"], shape["s_q"], shape["cache_seqlens"], - shape["h_q"], shape["h_kv"], shape["d"], shape["dv"], - shape["causal"], shape["dtype"]) + perf = compare_a( + args.target, + shape["b"], + shape["s_q"], + shape["cache_seqlens"], + shape["h_q"], + shape["h_kv"], + shape["d"], + shape["dv"], + shape["causal"], + shape["dtype"], + ) fout.write( - f'{args.target},{shape["b"]},{shape["cache_seqlens"].float().mean().cpu().item():.0f},{shape["h_q"]},{perf:.0f}\n' + f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n" ) diff --git a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py index e1dd0b4d6..7de4faf08 100644 --- a/examples/deepseek_mla/example_mla_decode.py +++ b/examples/deepseek_mla/example_mla_decode.py @@ -10,27 +10,31 @@ @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, - softmax_scale): + }, +) +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, 
seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): + # flash_attn_split + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=256) as (bid, hid, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -38,6 +42,7 @@ def flash_attn( K_pe_shared = T.alloc_shared([block_N, pe_dim], dtype) O_shared = T.alloc_shared([block_H, dim], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dim], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -46,64 +51,87 @@ def flash_attn( logsum = T.alloc_fragment([block_H], accum_dtype) cur_kv_head = hid // (kv_group_num // block_H) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) + T.use_swizzle(10) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(seqlen_kv, block_N) + loop_range = T.ceildiv((seqlen_kv // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=2): - T.copy(KV[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) - T.gemm( - Q_shared, - KV_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + kv_start = (seqlen_kv // num_split) * bz + k * block_N + kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N + T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) + T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) + T.clear(acc_s) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) + T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + 
logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, :]) + + # combine + with T.Kernel(heads, batch, threads=128) as (hid, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, hid, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, hid, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, hid, k, i] + lse_local_split = glse[bz, hid, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim): + Output[bz, hid, i] = o_accum_local[i] - @T.macro - def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=256) as (bid, hid, bz): + with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): Q_shared = T.alloc_shared([block_H, dim], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -111,7 +139,6 @@ def flash_attn_split( K_pe_shared = T.alloc_shared([block_N, pe_dim], dtype) O_shared = T.alloc_shared([block_H, dim], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dim], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -120,118 +147,39 @@ def flash_attn_split( logsum = T.alloc_fragment([block_H], accum_dtype) cur_kv_head = hid // (kv_group_num // block_H) - T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) 
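+            # Also stage the rope (pe) slice of Q; it feeds the Q_pe x K_pe GEMM that accumulates into the same acc_s below.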
+ T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=2): - kv_start = (seqlen_kv // num_split) * bz + k * block_N - kv_end = (seqlen_kv // num_split) * bz + (k + 1) * block_N - T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) - T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) - T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.copy(KV[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) - T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] - for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - with T.Kernel(heads, batch, threads=128) as (hid, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, hid, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, hid, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, hid, k, i] - lse_local_split[0] = glse[bz, 
hid, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, hid, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) + T.copy(O_shared, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :]) if num_split > 1: return main_split @@ -254,31 +202,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -296,10 +237,9 @@ def main( BLOCK_N = 64 BLOCK_H = min(64, heads // kv_heads) num_split = 1 - softmax_scale = (dim + pe_dim)**-0.5 + softmax_scale = (dim + pe_dim) ** -0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, - softmax_scale) + kernel = 
flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) @@ -307,14 +247,33 @@ def main( print(f"TFlops: {total_flops / latency * 1e-9} TFlops") +def run_regression_perf( + batch=1, + heads=128, + kv_heads=1, + kv_ctx=8192, + dim=512, + pe_dim=64, +): + BLOCK_N = 64 + BLOCK_H = min(64, heads // kv_heads) + num_split = 1 + softmax_scale = (dim + pe_dim) ** -0.5 + + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) + profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=132, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/deepseek_mla/example_mla_decode_paged.py b/examples/deepseek_mla/example_mla_decode_paged.py index fe50d4d4f..2e1911028 100644 --- a/examples/deepseek_mla/example_mla_decode_paged.py +++ b/examples/deepseek_mla/example_mla_decode_paged.py @@ -8,41 +8,36 @@ @tilelang.jit( - out_idx=[8], pass_configs={ + out_idx=[8], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def mla_decode_tilelang(batch, - h_q, - h_kv, - max_seqlen_pad, - dv, - dpe, - block_N, - block_H, - num_split, - block_size, - softmax_scale=None): + }, +) +def mla_decode_tilelang(batch, h_q, h_kv, max_seqlen_pad, dv, dpe, block_N, block_H, num_split, block_size, softmax_scale=None): if softmax_scale is None: - softmax_scale = (dv + dpe)**-0.5 + softmax_scale = (dv + dpe) ** -0.5 scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = h_q // h_kv VALID_BLOCK_H = min(block_H, kv_group_num) assert h_kv == 1, "h_kv must be 1" assert block_size >= block_N and block_size % block_N == 0, "block_size must be larger than block_N and a multiple of block_N" - @T.macro - def flash_mla_kernel( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // 
block_size], "int32"), - CACHE_SEQLENS: T.Tensor([batch], "int32"), - Output: T.Tensor([batch, h_q, dv], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + block_table: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): - with T.Kernel(batch, h_q // min(block_H, kv_group_num), threads=256) as (bx, by): + # split kv + with T.Kernel(batch, h_q // min(block_H, kv_group_num), num_split, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dv], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, dpe], dtype) @@ -50,6 +45,7 @@ def flash_mla_kernel( K_pe_shared = T.alloc_shared([block_N, dpe], dtype) O_shared = T.alloc_shared([block_H, dv], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dv], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -59,69 +55,94 @@ def flash_mla_kernel( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(CACHE_SEQLENS[bx], block_N) - for kr in T.Pipelined(loop_range, num_stages=2): - k = loop_range - 1 - kr - kv_start = BLOCK_TABLE[bx, (k * block_N) // - block_size] * block_size + (k * block_N) % block_size - T.copy(KV[kv_start:kv_start + block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[kv_start:kv_start + block_N, cur_kv_head, :], K_pe_shared) + total_blocks = T.ceildiv(cache_seqlens[bx], block_N) + blocks_per_split = T.floordiv(total_blocks, num_split) + remaining_blocks = T.floormod(total_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(bz < remaining_blocks, 1, 0) + start = (blocks_per_split * bz + T.min(bz, remaining_blocks)) * block_N + + for k in T.Pipelined(loop_range, num_stages=2): + kv_start = block_table[bx, (start + k * block_N) // block_size] * block_size + (k * block_N) % block_size + T.copy(KV[kv_start : kv_start + block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[kv_start : kv_start + block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) - if kr == 0: - for i, j in 
T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(k * block_N + j >= CACHE_SEQLENS[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(start + k * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) + T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dv): acc_o[i, j] *= scores_scale[i] - T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dv): acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, glse[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) - - @T.macro - def flash_mla_split_kv_kernel( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - BLOCK_TABLE: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - CACHE_SEQLENS: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + T.copy(O_shared, Output_partial[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, bz, :]) + + # combine + with T.Kernel(h_q, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dv], dtype) + o_accum_local = T.alloc_fragment([dv], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dv): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dv): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dv): + Output[bz, by, i] = o_accum_local[i] + + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, h_q, dv], dtype), + Q_pe: T.Tensor([batch, h_q, dpe], dtype), + KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), + K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), + block_table: T.Tensor([batch, max_seqlen_pad // block_size], T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, h_q, num_split], dtype), + Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), + Output: T.Tensor([batch, h_q, dv], dtype), ): - 
with T.Kernel( - batch, h_q // min(block_H, kv_group_num), num_split, threads=256) as (bx, by, bz): + with T.Kernel(batch, h_q // min(block_H, kv_group_num), threads=256) as (bx, by): Q_shared = T.alloc_shared([block_H, dv], dtype) S_shared = T.alloc_shared([block_H, block_N], dtype) Q_pe_shared = T.alloc_shared([block_H, dpe], dtype) @@ -129,7 +150,6 @@ def flash_mla_split_kv_kernel( K_pe_shared = T.alloc_shared([block_N, dpe], dtype) O_shared = T.alloc_shared([block_H, dv], dtype) acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) acc_o = T.alloc_fragment([block_H, dv], accum_dtype) scores_max = T.alloc_fragment([block_H], accum_dtype) scores_max_prev = T.alloc_fragment([block_H], accum_dtype) @@ -139,129 +159,45 @@ def flash_mla_split_kv_kernel( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - }) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - total_blocks = T.ceildiv(CACHE_SEQLENS[bx], block_N) - blocks_per_split = T.floordiv(total_blocks, num_split) - remaining_blocks = T.floormod(total_blocks, num_split) - loop_range = (blocks_per_split + T.if_then_else(bz < remaining_blocks, 1, 0)) - start = (blocks_per_split * bz + T.min(bz, remaining_blocks)) * block_N - - for k in T.Pipelined(loop_range, num_stages=2): - kv_start = BLOCK_TABLE[bx, (start + k * block_N) // - block_size] * block_size + (k * block_N) % block_size - T.copy(KV[kv_start:kv_start + block_N, cur_kv_head, :], KV_shared) - T.copy(K_pe[kv_start:kv_start + block_N, cur_kv_head, :], K_pe_shared) + loop_range = T.ceildiv(cache_seqlens[bx], block_N) + for kr in T.Pipelined(loop_range, num_stages=2): + k = loop_range - 1 - kr + kv_start = block_table[bx, (k * block_N) // block_size] * block_size + (k * block_N) % block_size + T.copy(KV[kv_start : kv_start + block_N, cur_kv_head, :], KV_shared) + T.copy(K_pe[kv_start : kv_start + block_N, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(start + k * block_N + j >= CACHE_SEQLENS[bx], - -T.infinity(accum_dtype), acc_s[i, j]) + if kr == 0: + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j]) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, 
j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) T.copy(acc_s, S_shared) - T.copy(S_shared, acc_s_cast) for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_H, dv): acc_o[i, j] *= scores_scale[i] - T.gemm(acc_s_cast, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol) for i, j in T.Parallel(block_H, dv): acc_o[i, j] /= logsum[i] - for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz]) T.copy(acc_o, O_shared) - T.copy(O_shared, Output_partial[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, bz, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), - ): - with T.Kernel(h_q, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dv], dtype) - o_accum_local = T.alloc_fragment([dv], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, by, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dv): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dv): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dv): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - block_table: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), - ): - flash_mla_split_kv_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, glse, - Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, h_q, dv], dtype), - Q_pe: T.Tensor([batch, h_q, dpe], dtype), - KV: T.Tensor([batch * max_seqlen_pad, h_kv, dv], dtype), - K_pe: T.Tensor([batch * max_seqlen_pad, h_kv, dpe], dtype), - block_table: T.Tensor([batch, max_seqlen_pad // block_size], "int32"), - cache_seqlens: T.Tensor([batch], "int32"), - glse: T.Tensor([batch, h_q, num_split], dtype), - Output_partial: T.Tensor([batch, h_q, num_split, dv], dtype), - Output: T.Tensor([batch, h_q, dv], dtype), - ): - flash_mla_kernel(Q, Q_pe, KV, K_pe, block_table, cache_seqlens, Output) + T.copy(O_shared, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) if 
num_split > 1: return main_split @@ -280,8 +216,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): s_q = query.shape[-2] s_k = key.shape[-2] attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype, device=query.device) - temp_mask = torch.ones( - s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q) + temp_mask = torch.ones(s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q) attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) attn_bias.to(query.dtype) attn_weight += attn_bias @@ -291,8 +226,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): @torch.inference_mode() -def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, - h_kv, d, dv, causal, dtype): +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): # q: [b, s_q, h_q, d] # block_table: [b, max_seqlen_pad // block_size] # blocked_k: [b * max_seqlen_pad // block_size, block_size, h_kv, d] @@ -321,13 +255,10 @@ def ref_mla(): return out_torch -def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, - h_q, h_kv, d, dv, causal, dtype): - +def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): assert d > dv, "mla with rope dim should be larger than no rope dim" q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous() - blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., - dv:].contiguous() + blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous() dpe = d - dv num_kv_splits = 1 @@ -337,8 +268,7 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device) glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device) - kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, - num_kv_splits, block_size, softmax_scale) + kernel = mla_decode_tilelang(b, h_q, h_kv, max_seqlen_pad, dv, dpe, BLOCK_N, BLOCK_H, num_kv_splits, block_size, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) def flash_mla_tilelang(): @@ -356,8 +286,7 @@ def flash_mla_tilelang(): out_flash = flash_mla_tilelang() t = do_bench(flash_mla_tilelang) - out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, - cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) torch.testing.assert_close(out_flash, out_ref, rtol=0.01, atol=0.01) print("All close") return out_flash, t @@ -365,12 +294,12 @@ def flash_mla_tilelang(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--h_q', type=int, default=128, help='q heads number') - parser.add_argument('--h_kv', type=int, default=1, help='kv heads number') - parser.add_argument('--cache_seqlen', type=int, default=8192, help='kv cache context length') - parser.add_argument('--d', type=int, default=576, help='query/key head dim, d = dv + dpe') - parser.add_argument('--dv', type=int, default=512, help='value head dim') + parser.add_argument("--batch", 
type=int, default=128, help="batch size") + parser.add_argument("--h_q", type=int, default=128, help="q heads number") + parser.add_argument("--h_kv", type=int, default=1, help="kv heads number") + parser.add_argument("--cache_seqlen", type=int, default=8192, help="kv cache context length") + parser.add_argument("--d", type=int, default=576, help="query/key head dim, d = dv + dpe") + parser.add_argument("--dv", type=int, default=512, help="value head dim") args = parser.parse_args() b, h_q, h_kv, cache_seqlen, d, dv = args.batch, args.h_q, args.h_kv, args.cache_seqlen, args.d, args.dv @@ -379,9 +308,7 @@ def flash_mla_tilelang(): s_q = 1 # for decode, s_q = 1 block_size = 64 - cache_seqlens = torch.tensor([cache_seqlen + 2 * i for i in range(b)], - dtype=torch.int32, - device=device) + cache_seqlens = torch.tensor([cache_seqlen + 2 * i for i in range(b)], dtype=torch.int32, device=device) dpe = d - dv causal = True @@ -393,12 +320,11 @@ def flash_mla_tilelang(): total_flops = s_q * total_seqlens * h_q * d * 2 q = torch.randn(b, s_q, h_q, d, dtype=dtype, device=device) - block_table = torch.arange( - b * max_seqlen_pad // block_size, dtype=torch.int32, - device=device).view(b, max_seqlen_pad // block_size) + block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32, device=device).view(b, max_seqlen_pad // block_size) blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d, dtype=dtype, device=device) - out_flash, latency = run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, - s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype) + out_flash, latency = run_tilelang_mla( + q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype + ) print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) diff --git a/examples/deepseek_mla/example_mla_decode_persistent.py b/examples/deepseek_mla/example_mla_decode_persistent.py index 3f57ea051..74d974fbb 100644 --- a/examples/deepseek_mla/example_mla_decode_persistent.py +++ b/examples/deepseek_mla/example_mla_decode_persistent.py @@ -9,13 +9,15 @@ @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @@ -23,13 +25,13 @@ def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_ @T.prim_func def main_split_persistent( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), 
+ Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(sm_num, threads=256) as (block_id): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -48,16 +50,11 @@ def main_split_persistent( logsum = T.alloc_fragment([block_H], accum_dtype) po_local = T.alloc_fragment([dim], dtype) o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - # O_shared: tilelang.layout.make_swizzled_layout(O_shared), - S_shared: tilelang.layout.make_swizzled_layout(S_shared), - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + T.use_swizzle(10) total_tiles = batch * (heads // min(block_H, kv_group_num)) * num_split @@ -70,8 +67,8 @@ def main_split_persistent( cur_kv_head = hid // (kv_group_num // block_H) if bid < batch and hid * VALID_BLOCK_H < heads and sid < num_split: - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) @@ -83,24 +80,15 @@ def main_split_persistent( T.copy(KV[bid, kv_start:kv_end, cur_kv_head, :], KV_shared) T.copy(K_pe[bid, kv_start:kv_end, cur_kv_head, :], K_pe_shared) T.clear(acc_s) - T.gemm( - Q_shared, - KV_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) @@ -115,11 +103,9 @@ def main_split_persistent( acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, sid]) + T.copy(logsum, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, sid]) # T.copy(acc_o, O_shared) - T.copy( - acc_o, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - sid, :]) + T.copy(acc_o, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, sid, :]) T.sync_grid() waves = T.ceildiv(heads * batch, sm_num) @@ -130,20 +116,20 @@ def main_split_persistent( if bid < batch and hid < heads: T.clear(lse_logsum_local) T.clear(o_accum_local) - lse_max_local[0] = 
-T.infinity(accum_dtype) + lse_max_local = -T.infinity(accum_dtype) for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bid, hid, k]) + lse_max_local = T.max(lse_max_local, glse[bid, hid, k]) for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bid, hid, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] + lse_local_split = glse[bid, hid, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local for k in T.serial(num_split): for i in T.Parallel(dim): po_local[i] = Output_partial[bid, hid, k, i] - lse_local_split[0] = glse[bid, hid, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) + lse_local_split = glse[bid, hid, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] + o_accum_local[i] += po_local[i] * scale_local for i in T.Parallel(dim): Output[bid, hid, i] = o_accum_local[i] @@ -165,42 +151,35 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out def main(): parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + 
parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) diff --git a/examples/deepseek_mla/example_mla_decode_ws.py b/examples/deepseek_mla/example_mla_decode_ws.py index 6554d57de..32eb0d475 100644 --- a/examples/deepseek_mla/example_mla_decode_ws.py +++ b/examples/deepseek_mla/example_mla_decode_ws.py @@ -13,30 +13,38 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) -def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, - softmax_scale): +def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): sm_scale = float(softmax_scale * 1.44269504) # log2(e) - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=384) as (hid, bid): + # flash_attn_split + with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=384) as (bid, hid, bz): Q_shared_l = T.alloc_shared([block_H, dim // 2], dtype) Q_shared_r = T.alloc_shared([block_H, dim // 2], dtype) Q_tail_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -75,16 +83,16 @@ def flash_attn( tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, 0:dim // 2], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, dim // 2:dim], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) + T.copy(Q[bid, 
hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) @@ -105,6 +113,8 @@ def flash_attn( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -137,6 +147,8 @@ def flash_attn( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -162,8 +174,8 @@ def flash_attn( for h_i in T.Parallel(block_H): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - 0:dim // 2]) + T.copy(O_shared_l, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, 0 : dim // 2]) + T.copy(sumexp, glse[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -193,8 +205,7 @@ def flash_attn( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - dim // 2:dim]) + T.copy(O_shared_r, Output_partial[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, bz, dim // 2 : dim]) elif tx >= 256: # producer @@ -203,59 +214,82 @@ def flash_attn( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2 + 1) * block_N + r 
* 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) - @T.macro - def flash_attn_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + # combine + with T.Kernel(heads, batch, threads=128) as (hid, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_max_local = T.max(lse_max_local, glse[bz, hid, k]) + for k in T.Pipelined(num_split, num_stages=1): + lse_local_split = glse[bz, hid, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, hid, k, i] + lse_local_split = glse[bz, hid, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim): + Output[bz, hid, i] = o_accum_local[i] + + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): - with T.Kernel( - batch, heads // min(block_H, kv_group_num), num_split, - threads=384) as (bid, hid, bz): + with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=384) as (hid, bid): Q_shared_l = T.alloc_shared([block_H, dim // 2], dtype) Q_shared_r = T.alloc_shared([block_H, dim // 2], dtype) Q_tail_shared = T.alloc_shared([block_H, pe_dim], dtype) @@ -294,16 +328,16 @@ def flash_attn_split( tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, 0:dim // 2], Q_shared_l) - T.copy(Q[bid, hid * 
VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, dim // 2:dim], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) @@ -323,7 +357,9 @@ def flash_attn_split( T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2) & 1) ^ 1) T.copy(m_i, m_i_prev) - T.reduce_max(acc_s, m_i, dim=1, clear=False) + T.reduce_max(acc_s, out=m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -356,6 +392,8 @@ def flash_attn_split( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(block_H): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(block_H): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(block_H, block_N): @@ -381,10 +419,7 @@ def flash_attn_split( for h_i in T.Parallel(block_H): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy( - O_shared_l, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, 0:dim // 2]) - T.copy(sumexp, glse[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, bz]) + T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, 0 : dim // 2]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -414,9 +449,7 @@ def flash_attn_split( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy( - O_shared_r, Output_partial[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - bz, dim // 2:dim]) + T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, dim // 2 : dim]) elif tx >= 256: # producer @@ -425,111 +458,43 @@ def flash_attn_split( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (seqlen_kv // num_split) * bz + ( - i_i * 2) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + 
bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - kv_indices = (seqlen_kv // num_split) * bz + ( - i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 + kv_indices = (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[bid, kv_indices, cur_kv_head, dim // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = K_pe[bid, kv_indices, cur_kv_head, - (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ + bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - with T.Kernel(heads, batch, threads=128) as (hid, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_local([1], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local[0] = -T.infinity(accum_dtype) - for k in T.serial(num_split): - lse_max_local[0] = T.max(lse_max_local[0], glse[bz, hid, k]) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, hid, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, hid, k, i] - lse_local_split[0] = glse[bz, hid, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, hid, i] = o_accum_local[i] - - @T.prim_func - def main_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn_split(Q, Q_pe, KV, K_pe, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - 
KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) - if num_split > 1: return main_split else: @@ -551,31 +516,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -593,10 +551,9 @@ def main( BLOCK_N = 64 BLOCK_H = min(64, heads // kv_heads) num_split = 1 - softmax_scale = (dim + pe_dim)**-0.5 + softmax_scale = (dim + pe_dim) ** -0.5 - kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, - softmax_scale) + kernel = flashattn(batch, heads, kv_heads, kv_ctx, dim, pe_dim, BLOCK_N, BLOCK_H, num_split, softmax_scale) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) profiler.assert_allclose(ref_program, rtol=1e-4, atol=1e-4) latency = profiler.do_bench(warmup=500) @@ -606,12 +563,12 @@ def main( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=132, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", 
type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py index 1b1447e88..e70c35349 100644 --- a/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py +++ b/examples/deepseek_mla/experimental/example_mla_decode_kv_fp8.py @@ -8,25 +8,27 @@ @tilelang.jit( - out_idx=[-1], pass_configs={ + out_idx=[-1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - q_dtype = "float8_e4m3" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + q_dtype = T.float8_e4m3fn + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" @T.prim_func def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], q_dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], q_dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(batch, heads // min(block_H, kv_group_num), threads=256) as (bx, by): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -46,34 +48,27 @@ def main_no_split( cur_kv_head = by // (kv_group_num // block_H) T.use_swizzle(10) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - T.copy(Q[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_shared) - T.copy(Q_pe[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_shared) + T.copy(Q_pe[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) T.disable_warp_group_reg_alloc() loop_range = T.ceildiv(seqlen_kv, block_N) for k in T.Pipelined(loop_range, num_stages=2): - T.copy(KV[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], qKV_shared) - T.copy(K_pe[bx, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_pe_shared) + T.copy(KV[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], qKV_shared) + T.copy(K_pe[bx, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_pe_shared) T.copy(qKV_shared, KV_shared) T.clear(acc_s) - T.gemm( - Q_shared, KV_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - Q_pe_shared, - K_pe_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_s, transpose_B=True, 
policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_pe_shared, K_pe_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -88,7 +83,7 @@ def main_no_split( for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bx, by * VALID_BLOCK_H:(by + 1) * VALID_BLOCK_H, :]) + T.copy(O_shared, Output[bx, by * VALID_BLOCK_H : (by + 1) * VALID_BLOCK_H, :]) return main_no_split @@ -106,42 +101,35 @@ def ref_program(q, q_pe, kv, k_pe): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=128, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=128, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + 
parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) diff --git a/examples/deepseek_mla/regression_example_mla_decode.py b/examples/deepseek_mla/regression_example_mla_decode.py new file mode 100644 index 000000000..64e1c436a --- /dev/null +++ b/examples/deepseek_mla/regression_example_mla_decode.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_mla_decode + + +def regression_example_mla_decode(): + tilelang.testing.process_func(example_mla_decode.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py index 66a750f7d..a269ea57a 100644 --- a/examples/deepseek_mla/test_example_mla_decode.py +++ b/examples/deepseek_mla/test_example_mla_decode.py @@ -1,5 +1,4 @@ import tilelang.testing - import example_mla_decode diff --git a/examples/deepseek_mla/torch_refs.py b/examples/deepseek_mla/torch_refs.py index 4b4c888cd..aae6c7cd2 100644 --- a/examples/deepseek_mla/torch_refs.py +++ b/examples/deepseek_mla/torch_refs.py @@ -11,7 +11,7 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): block_N = 64 seqlen_kv = KV.size(1) - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, nheads, block_N), device="cuda", dtype=torch.float) acc_s_cast = torch.empty((batch, nheads, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, nheads, dim), device="cuda", dtype=torch.float) @@ -31,18 +31,20 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bhd,bkhd->bhk', Q_, - KV_[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, nheads, block_N] + acc_s = torch.einsum( + "bhd,bkhd->bhk", + Q_, + KV_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, nheads, block_N] acc_s += torch.einsum( - 'bhd,bkhd->bhk', Q_pe_, - K_pe_[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhd,bkhd->bhk", + Q_pe_, + K_pe_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [batch, nheads] scores_scale = torch.exp2(scores_max_prev - scores_max) # [batch, nheads] @@ -50,9 +52,10 @@ def flash_split_ref(Q, Q_pe, KV, K_pe): acc_s = torch.exp2(acc_s - scores_max[:, :, None]) acc_s_cast = acc_s.to(torch.float16) # [batch, nheads, block_N] acc_o += torch.einsum( - 'bhk,bkhd->bhd', acc_s_cast, - KV_[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhk,bkhd->bhd", + acc_s_cast, + KV_[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = 
acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum acc_o /= logsum[:, :, None] diff --git a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py index daee39865..ca98d01be 100644 --- a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py +++ b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py @@ -14,21 +14,44 @@ from fla.utils import autocast_custom_fwd, contiguous -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -40,20 +63,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -66,7 +87,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -87,7 +108,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ 
-100,8 +120,7 @@ def forward(ctx, q, k, v, block_indices, block_size, scale, offsets): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -172,7 +191,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -195,7 +213,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -207,18 +226,20 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -258,44 +279,44 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o -def naive_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def naive_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -335,26 +356,24 @@ def naive_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") + raise RuntimeError("Sequences with variable lengths are not supported for head-first mode") if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") dtype = q.dtype G = q.shape[2] // k.shape[2] BS = block_size S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) @@ -364,14 +383,11 @@ def naive_nsa(q: torch.Tensor, if cu_seqlens is None: varlen = False B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) + cu_seqlens = torch.cat([block_indices.new_tensor(range(0, B * T, T)), block_indices.new_tensor([B * T])]) for i in range(len(cu_seqlens) - 1): if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] + q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[i], block_indices[i] if isinstance(block_counts, torch.Tensor): s_b = block_counts[i] else: @@ -379,10 +395,10 @@ def naive_nsa(q: torch.Tensor, else: T = cu_seqlens[i + 1] - cu_seqlens[i] q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) + lambda x: x[0][cu_seqlens[i] : cu_seqlens[i + 1]], (q, k, v, g_slc, g_swa, block_indices) + ) if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] + s_b = block_counts[0][cu_seqlens[i] : cu_seqlens[i + 1]] else: s_b = block_counts @@ -404,71 +420,58 @@ def naive_nsa(q: torch.Tensor, else: s_i = s_b # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) + k_i_slc, v_i_slc = map(lambda x: x.gather(0, i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) + attn_slc = ( + torch.einsum("h d, n h d -> n h", q_i, k_i_slc) + .masked_fill(torch.logical_or(i_i < 0, i_i > i_q) | (c >= s_i if block_counts is not None else False), float("-inf")) + .softmax(0) + ) if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * 
g_slc_i.unsqueeze(-1) + o_slc[i, i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) + o_slc[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) + k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1) : i_q + 1], (k_b, v_b)) + attn_swa = torch.einsum("h d, n h d -> n h", q_i, k_i_swa).softmax(0) if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[i, i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') + o_slc = rearrange(o_slc, "b t h d -> b h t d") + o_swa = rearrange(o_swa, "b t h d -> b h t d") return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) def get_configs(): import itertools + iter_params = dict( block_T=[128, 256, 512], num_stages=[0, 1, 2, 4, 5], threads=[32, 64, 128, 256, 512], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit( pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def tilelang_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16, - block_T=128, - num_stages=2, - threads=32): + } +) +def tilelang_sparse_attention( + batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16, block_T=128, num_stages=2, threads=32 +): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -476,9 +479,9 @@ def tilelang_sparse_attention(batch, q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(block_T, tilelang.math.next_power_of_2(dim)) @@ -493,11 +496,11 @@ def tilelang_sparse_attention(batch, @T.prim_func def tilelang_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + Output: T.Tensor(q_shape, dtype), ): 
with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -514,13 +517,11 @@ def tilelang_sparse_attention( scores_sum = T.alloc_fragment([G], accum_dtype) logsum = T.alloc_fragment([G], accum_dtype) - T.annotate_layout({O_shared: tilelang.layout.make_swizzled_layout(O_shared)}) - i_t, i_v, i_bh = bx, by, bz i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -530,21 +531,15 @@ def tilelang_sparse_attention( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -564,45 +559,33 @@ def tilelang_sparse_attention( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, Output[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return tilelang_sparse_attention def generate_block_indices(batch, seq_len, heads, selected_blocks, block_size): """Generate random block indices for the benchmark.""" - block_indices = torch.full((batch, seq_len, heads, selected_blocks), - seq_len, - dtype=torch.long, - device='cuda') + block_indices = torch.full((batch, seq_len, heads, selected_blocks), seq_len, dtype=torch.long, device="cuda") for b in range(batch): for t in range(seq_len): for h in range(heads): i_i = torch.randperm(max(1, (t // block_size)))[:selected_blocks] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i return block_indices.sort(-1)[0] -def benchmark_nsa(batch_size, - seq_len, - heads, - head_query, - dim, - selected_blocks, - block_size, - dtype, - scale, - warmup=10, - iterations=100, - validate=False): +def benchmark_nsa( + batch_size, seq_len, heads, head_query, dim, selected_blocks, block_size, dtype, scale, warmup=10, iterations=100, validate=False +): """Benchmark the TileLang Sparse Attention implementation.""" # Set random seed for reproducibility @@ -628,14 +611,13 @@ def benchmark_nsa(batch_size, print(f"Profiler latency: {profiler_latency} ms") # Create input tensors - Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - out = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') + Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + 
V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + out = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") # Generate block indices - block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, - block_size).to(torch.int32) + block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, block_size).to(torch.int32) # Warmup for _ in range(warmup): @@ -666,10 +648,9 @@ def benchmark_nsa(batch_size, # Validate result against reference if requested if validate: - g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - block_counts = torch.randint( - 1, selected_blocks + 1, (batch_size, seq_len, heads), device='cuda') + g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + block_counts = torch.randint(1, selected_blocks + 1, (batch_size, seq_len, heads), device="cuda") ref = naive_nsa( q=Q, @@ -700,22 +681,13 @@ def benchmark_nsa(batch_size, "head_query": head_query, "dim": dim, "selected_blocks": selected_blocks, - "block_size": block_size + "block_size": block_size, } -def benchmark_triton_nsa(batch_size, - seq_len, - heads, - head_query, - dim, - selected_blocks, - block_size, - dtype, - scale, - warmup=10, - iterations=100, - validate=False): +def benchmark_triton_nsa( + batch_size, seq_len, heads, head_query, dim, selected_blocks, block_size, dtype, scale, warmup=10, iterations=100, validate=False +): """Benchmark the Triton-based TileLang Sparse Attention implementation.""" # Set random seed for reproducibility @@ -723,18 +695,17 @@ def benchmark_triton_nsa(batch_size, torch.random.manual_seed(0) # Create input tensors - Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device='cuda') - g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') - g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device='cuda') + Q = torch.randn((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + K = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + V = torch.randn((batch_size, seq_len, heads, dim), dtype=dtype, device="cuda") + g_slc = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") + g_swa = torch.ones((batch_size, seq_len, head_query), dtype=dtype, device="cuda") # Generate block indices block_indices = generate_block_indices(batch_size, seq_len, heads, selected_blocks, block_size) - block_counts = torch.randint( - 1, selected_blocks + 1, (batch_size, seq_len, heads), device='cuda') - o_slc = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device='cuda') - lse_slc = torch.empty((batch_size, seq_len, head_query), dtype=torch.float, device='cuda') + block_counts = torch.randint(1, selected_blocks + 1, (batch_size, seq_len, heads), device="cuda") + o_slc = torch.empty((batch_size, seq_len, head_query, dim), dtype=dtype, device="cuda") + lse_slc = torch.empty((batch_size, seq_len, head_query), dtype=torch.float, device="cuda") # Warmup for _ in range(warmup): @@ -750,7 +721,8 @@ def benchmark_triton_nsa(batch_size, block_counts=block_counts, block_size=block_size, window_size=0, 
- scale=scale) + scale=scale, + ) # Synchronize before timing torch.cuda.synchronize() @@ -770,7 +742,8 @@ def benchmark_triton_nsa(batch_size, block_counts=block_counts, block_size=block_size, window_size=0, - scale=scale) + scale=scale, + ) torch.cuda.synchronize() end_time = time.time() @@ -815,54 +788,28 @@ def benchmark_triton_nsa(batch_size, "head_query": head_query, "dim": dim, "selected_blocks": selected_blocks, - "block_size": block_size + "block_size": block_size, } -def run_benchmark_suite(impl='all'): +def run_benchmark_suite(impl="all"): """Run a suite of benchmarks with different configurations.""" # Define configurations to benchmark configs = [ # Small model config - Note: head_query must be a multiple of heads*16 for Triton - { - "batch_size": 2, - "seq_len": 1024, - "heads": 8, - "head_query": 8 * 16, - "dim": 64, - "selected_blocks": 8, - "block_size": 32 - }, - + {"batch_size": 2, "seq_len": 1024, "heads": 8, "head_query": 8 * 16, "dim": 64, "selected_blocks": 8, "block_size": 32}, # Medium model config - { - "batch_size": 2, - "seq_len": 2048, - "heads": 16, - "head_query": 16 * 16, - "dim": 64, - "selected_blocks": 16, - "block_size": 64 - }, - + {"batch_size": 2, "seq_len": 2048, "heads": 16, "head_query": 16 * 16, "dim": 64, "selected_blocks": 16, "block_size": 64}, # Large model config - { - "batch_size": 1, - "seq_len": 4096, - "heads": 32, - "head_query": 32 * 16, - "dim": 128, - "selected_blocks": 32, - "block_size": 128 - }, + {"batch_size": 1, "seq_len": 4096, "heads": 32, "head_query": 32 * 16, "dim": 128, "selected_blocks": 32, "block_size": 128}, ] results = [] for config in configs: print(f"Running benchmark with config: {config}") - if impl in ['all', 'tilelang']: + if impl in ["all", "tilelang"]: print("Benchmarking TileLang implementation:") result = benchmark_nsa( batch_size=config["batch_size"], @@ -874,12 +821,13 @@ def run_benchmark_suite(impl='all'): block_size=config["block_size"], dtype=torch.float16, scale=0.1, - validate=False) + validate=False, + ) results.append({"impl": "tilelang", **result}) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") - if impl in ['all', 'triton']: + if impl in ["all", "triton"]: print("Benchmarking Triton implementation:") result = benchmark_triton_nsa( batch_size=config["batch_size"], @@ -891,19 +839,24 @@ def run_benchmark_suite(impl='all'): block_size=config["block_size"], dtype=torch.float16, scale=0.1, - validate=False) + validate=False, + ) results.append({"impl": "triton", **result}) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") - if impl in ['all']: + if impl in ["all"]: # Print comparison if both implementations were run tilelang_result = next( - r for r in results if r["impl"] == "tilelang" and - r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"]) + r + for r in results + if r["impl"] == "tilelang" and r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"] + ) triton_result = next( - r for r in results if r["impl"] == "triton" and - r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"]) + r + for r in results + if r["impl"] == "triton" and r["batch_size"] == config["batch_size"] and r["seq_len"] == config["seq_len"] + ) speedup = tilelang_result["avg_time_ms"] / triton_result["avg_time_ms"] print(f"Speedup (Triton vs TileLang): {speedup:.2f}x") @@ -921,8 +874,7 @@ def run_benchmark_suite(impl='all'): 
parser.add_argument("--dim", type=int, default=128, help="Head dimension") parser.add_argument("--selected_blocks", type=int, default=16, help="Number of selected blocks") parser.add_argument("--block_size", type=int, default=32, help="Block size") - parser.add_argument( - "--dtype", type=str, default="float16", help="Data type (float16 or float32)") + parser.add_argument("--dtype", type=str, default=T.float16, help="Data type (float16 or float32)") parser.add_argument("--scale", type=float, default=0.1, help="Attention scale factor") parser.add_argument("--iterations", type=int, default=100, help="Number of iterations") parser.add_argument("--warmup", type=int, default=10, help="Warmup iterations") @@ -933,7 +885,8 @@ def run_benchmark_suite(impl='all'): type=str, default="all", choices=["tilelang", "triton", "all"], - help="Implementation to benchmark (tilelang, triton, or all)") + help="Implementation to benchmark (tilelang, triton, or all)", + ) args = parser.parse_args() @@ -941,13 +894,12 @@ def run_benchmark_suite(impl='all'): if args.impl in ["triton", "all"] and args.head_query % (args.heads * 16) != 0: # Adjust head_query to nearest valid value args.head_query = ((args.head_query // (args.heads * 16)) + 1) * (args.heads * 16) - print( - f"Adjusted head_query to {args.head_query} to be compatible with Triton implementation") + print(f"Adjusted head_query to {args.head_query} to be compatible with Triton implementation") if args.suite: run_benchmark_suite(impl=args.impl) else: - dtype = torch.float16 if args.dtype == "float16" else torch.float32 + dtype = torch.float16 if args.dtype == T.float16 else torch.float32 if args.impl in ["tilelang", "all"]: print("Benchmarking TileLang implementation:") @@ -963,12 +915,14 @@ def run_benchmark_suite(impl='all'): scale=args.scale, warmup=args.warmup, iterations=args.iterations, - validate=args.validate) + validate=args.validate, + ) print("\nBenchmark Results (TileLang):") print( - f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + - f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + - f"block_size={args.block_size}") + f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + + f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + + f"block_size={args.block_size}" + ) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") @@ -986,11 +940,13 @@ def run_benchmark_suite(impl='all'): scale=args.scale, warmup=args.warmup, iterations=args.iterations, - validate=args.validate) + validate=args.validate, + ) print("\nBenchmark Results (Triton):") print( - f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + - f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + - f"block_size={args.block_size}") + f"Configuration: batch={args.batch}, seq_len={args.seq_len}, heads={args.heads}, " + + f"head_query={args.head_query}, dim={args.dim}, blocks={args.selected_blocks}, " + + f"block_size={args.block_size}" + ) print(f"Average time: {result['avg_time_ms']:.2f} ms") print(f"Performance: {result['tflops']:.2f} TFLOPs") diff --git a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py index 8387d2271..3da285a9b 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py @@ -7,6 +7,7 @@ import triton import fla + if 
parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -22,7 +23,8 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + } +) def tilelang_kernel_fwd( batch, heads, @@ -34,11 +36,10 @@ def tilelang_kernel_fwd( groups=1, selected_blocks=16, ): - from tilelang import language as T if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -48,9 +49,9 @@ def tilelang_kernel_fwd( o_slc_shape = [batch, seq_len, heads, dim] lse_slc_shape = [batch, seq_len, heads] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -67,12 +68,12 @@ def tilelang_kernel_fwd( @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - O_slc: T.Tensor(o_slc_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + O_slc: T.Tensor(o_slc_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), ): with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -93,7 +94,7 @@ def native_sparse_attention( i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -103,12 +104,11 @@ def native_sparse_attention( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) @@ -124,21 +124,21 @@ def native_sparse_attention( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=True) - for i in T.Parallel(G): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + for k in T.Parallel(G): + scores_scale[k] = T.exp2(scores_max_prev[k] * scale - scores_max[k] * scale) + for k, j in T.Parallel(G, BS): + acc_s[k, j] = T.exp2(acc_s[k, j] * scale - scores_max[k] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(G): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + for k in T.Parallel(G): + logsum[k] = logsum[k] * scores_scale[k] + scores_sum[k] T.copy(acc_s, acc_s_cast) # Rescale - for i, j in T.Parallel(G, BV): - acc_o[i, j] *= scores_scale[i] + for k, j in T.Parallel(G, BV): + acc_o[k, j] *= scores_scale[k] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + 
T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): @@ -146,18 +146,20 @@ def native_sparse_attention( T.copy(acc_o, O_shared) T.copy( O_shared, - O_slc[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV], + O_slc[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV], ) for i in T.Parallel(G): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, LSE_slc[i_b, i_t, i_h * G:(i_h + 1) * G]) + T.copy(logsum, LSE_slc[i_b, i_t, i_h * G : (i_h + 1) * G]) return native_sparse_attention -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def tilelang_kernel_bwd_dkv( batch, heads, @@ -168,11 +170,11 @@ def tilelang_kernel_bwd_dkv( block_size=64, groups=1, selected_blocks=16, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, ): if scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 else: sm_scale = scale @@ -207,15 +209,15 @@ def tilelang_kernel_bwd_dkv( @T.prim_func def flash_bwd_dkv( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(k_shape, dtype), - V: T.Tensor(v_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), - Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), - DO_slc: T.Tensor(do_slc_shape, dtype), - DK: T.Tensor(dk_shape, dtype), - DV: T.Tensor(dv_shape, dtype), - BlockMask: T.Tensor(block_mask_shape, "int32"), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(k_shape, dtype), + V: T.Tensor(v_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), + DO_slc: T.Tensor(do_slc_shape, dtype), + DK: T.Tensor(dk_shape, dtype), + DV: T.Tensor(dv_shape, dtype), + BlockMask: T.Tensor(block_mask_shape, T.int32), ): with T.Kernel(NV, NS, B * H, threads=num_threads) as (i_v, i_s, i_bh): K_shared = T.alloc_shared([BS, BK], dtype) @@ -238,31 +240,25 @@ def flash_bwd_dkv( i_b, i_h = i_bh // H, i_bh % H - T.copy(K[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK], K_shared) - T.copy(V[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV], V_shared) + T.copy(K[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK], K_shared) + T.copy(V[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV], V_shared) # [BS, BK] T.clear(dk) # [BS, BV] T.clear(dv) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - loop_st = i_s * BS loop_ed = seq_len for i in T.Pipelined( - start=loop_st, - stop=loop_ed, - num_stages=0, + start=loop_st, + stop=loop_ed, + num_stages=0, ): b_m_slc = BlockMask[i_b, i, i_h, i_s] if b_m_slc != 0: # [G, BK] - T.copy(Q[i_b, i, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[i_b, i, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.clear(qkT) # [BS, BK] @ [G, BK] -> [BS, G] T.gemm( @@ -273,7 +269,7 @@ def flash_bwd_dkv( policy=T.GemmWarpPolicy.FullRow, ) # [G] - T.copy(LSE_slc[i_b, i, i_h * G:(i_h + 1) * G], lse_shared) + T.copy(LSE_slc[i_b, i, i_h * G : (i_h + 1) * G], lse_shared) for _i, _j in T.Parallel(BS, G): qkT[_i, _j] = T.exp2(qkT[_i, _j] * scale - lse_shared[_j]) @@ -282,7 +278,7 @@ def flash_bwd_dkv( qkT[_i, _j] = T.if_then_else(i >= (i_s * BS + _i), qkT[_i, _j], 0) # [G, BV] - T.copy(DO_slc[i_b, i, i_h * G:(i_h + 1) * G, :BV], do) + T.copy(DO_slc[i_b, i, i_h * G : (i_h + 1) * G, :BV], do) 
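A note on the `T.exp2(qkT * scale - lse_shared)` line just above: the backward kernels do not store the attention probabilities; they rebuild them from the log-sum-exp saved by the forward pass, keeping everything in log2 space via the 1.44269504 (= log2 e) factor. A dense, single-tile sketch of that recomputation, with illustrative names that do not appear in the diff:

import torch

def recompute_probs(scores: torch.Tensor, lse: torch.Tensor, sm_scale: float) -> torch.Tensor:
    # scores: [BS, G] raw q.k dot products for one key tile; lse: [G] log2-sum-exp from the forward pass.
    log2e = 1.44269504
    return torch.exp2(scores.float() * sm_scale * log2e - lse.float().unsqueeze(0))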
T.clear(dsT) # [BS, BV] @ [G, BV] -> [BS, G] T.gemm( @@ -296,7 +292,7 @@ def flash_bwd_dkv( # [BS, G] @ [G, BV] -> [BS, BV] T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] - T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) + T.copy(Delta_slc[i_b, i, i_h * G : (i_h + 1) * G], delta) for i, j in T.Parallel(BS, G): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -305,8 +301,8 @@ def flash_bwd_dkv( T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, DV[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV]) - T.copy(dk_shared, DK[i_v, i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK]) + T.copy(dv_shared, DV[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV]) + T.copy(dk_shared, DK[i_v, i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK]) return flash_bwd_dkv @@ -321,9 +317,11 @@ def make_dq_layout(dQ): ) -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def tilelang_kernel_bwd_dqkv( batch, heads, @@ -334,11 +332,11 @@ def tilelang_kernel_bwd_dqkv( block_size=64, groups=1, selected_blocks=16, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, ): if scale is None: - sm_scale = (1.0 / dim)**0.5 + sm_scale = (1.0 / dim) ** 0.5 else: sm_scale = scale @@ -373,16 +371,16 @@ def tilelang_kernel_bwd_dqkv( @T.prim_func def flash_bwd_dqkv( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(k_shape, dtype), - V: T.Tensor(v_shape, dtype), - LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), - Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), - DO_slc: T.Tensor(do_slc_shape, dtype), - DQ: T.Tensor(dq_shape, dtype), - DK: T.Tensor(dk_shape, dtype), - DV: T.Tensor(dv_shape, dtype), - BlockMask: T.Tensor(block_mask_shape, "int32"), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(k_shape, dtype), + V: T.Tensor(v_shape, dtype), + LSE_slc: T.Tensor(lse_slc_shape, accum_dtype), + Delta_slc: T.Tensor(delta_slc_shape, accum_dtype), + DO_slc: T.Tensor(do_slc_shape, dtype), + DQ: T.Tensor(dq_shape, dtype), + DK: T.Tensor(dk_shape, dtype), + DV: T.Tensor(dv_shape, dtype), + BlockMask: T.Tensor(block_mask_shape, T.int32), ): with T.Kernel(NV, NS, B * H, threads=num_threads) as (i_v, i_s, i_bh): K_shared = T.alloc_shared([BS, BK], dtype) @@ -406,31 +404,25 @@ def flash_bwd_dqkv( i_b, i_h = i_bh // H, i_bh % H - T.copy(K[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK], K_shared) - T.copy(V[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV], V_shared) + T.copy(K[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK], K_shared) + T.copy(V[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV], V_shared) # [BS, BK] T.clear(dk) # [BS, BV] T.clear(dv) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - loop_st = i_s * BS loop_ed = seq_len for i in T.Pipelined( - start=loop_st, - stop=loop_ed, - num_stages=0, + start=loop_st, + stop=loop_ed, + num_stages=0, ): b_m_slc = BlockMask[i_b, i, i_h, i_s] if b_m_slc != 0: # [G, BK] - T.copy(Q[i_b, i, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[i_b, i, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.clear(qkT) # [BS, BK] @ [G, BK] -> [BS, G] T.gemm( @@ -441,7 +433,7 @@ def flash_bwd_dqkv( policy=T.GemmWarpPolicy.FullRow, ) # [G] - T.copy(LSE_slc[i_b, i, i_h * G:(i_h + 1) * G], lse_shared) + T.copy(LSE_slc[i_b, i, i_h * G : (i_h + 1) * G], lse_shared) for _i, _j in T.Parallel(BS, G): qkT[_i, _j] = T.exp2(qkT[_i, _j] 
* scale - lse_shared[_j]) @@ -450,7 +442,7 @@ def flash_bwd_dqkv( qkT[_i, _j] = T.if_then_else(i >= (i_s * BS + _i), qkT[_i, _j], 0) # [G, BV] - T.copy(DO_slc[i_b, i, i_h * G:(i_h + 1) * G, :BV], do) + T.copy(DO_slc[i_b, i, i_h * G : (i_h + 1) * G, :BV], do) T.clear(dsT) # [BS, BV] @ [G, BV] -> [BS, G] T.gemm( @@ -464,9 +456,9 @@ def flash_bwd_dqkv( # [BS, G] @ [G, BV] -> [BS, BV] T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) # [G] - T.copy(Delta_slc[i_b, i, i_h * G:(i_h + 1) * G], delta) - for i, j in T.Parallel(BS, G): - dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.copy(Delta_slc[i_b, i, i_h * G : (i_h + 1) * G], delta) + for _i, _j in T.Parallel(BS, G): + dsT_cast[_i, _j] = qkT[_i, _j] * (dsT[_i, _j] - delta[_j]) * sm_scale # [BS, G] @ [G, BK] -> [BS, BK] T.gemm(dsT_cast, Q_shared, dk, policy=T.GemmWarpPolicy.FullRow) @@ -480,23 +472,25 @@ def flash_bwd_dqkv( T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, DV[i_b, i_s * BS:(i_s + 1) * BS, i_h, :BV]) - T.copy(dk_shared, DK[i_v, i_b, i_s * BS:(i_s + 1) * BS, i_h, :BK]) + T.copy(dv_shared, DV[i_b, i_s * BS : (i_s + 1) * BS, i_h, :BV]) + T.copy(dk_shared, DK[i_v, i_b, i_s * BS : (i_s + 1) * BS, i_h, :BK]) return flash_bwd_dqkv @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def tilelang_kernel_preprocess( batch, heads, seq_len, dim, - dtype="float16", - accum_dtype="float", + dtype=T.float16, + accum_dtype=T.float32, blk=32, ): from tilelang import language as T @@ -505,9 +499,9 @@ def tilelang_kernel_preprocess( @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, seq_len, heads], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, seq_len, heads], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -516,27 +510,29 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, by * blk:(by + 1) * blk, bx]) + T.copy(delta, Delta[bz, by * blk : (by + 1) * blk, bx]) return flash_bwd_prep @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def tilelang_kernel_block_mask( batch, heads, seq_len, selected_blocks, block_size, - dtype="int32", + dtype=T.int32, ): from tilelang import language as T @@ -551,9 +547,9 @@ def tilelang_kernel_block_mask( @T.prim_func def flash_bwd_block_mask( - BlockIndices: T.Tensor(block_indices_shape, dtype), # type: ignore - BlockCounts: T.Tensor(block_counts_shape, dtype), # type: ignore - BlockMask: T.Tensor(block_mask_shape, dtype), # type: ignore + BlockIndices: T.Tensor(block_indices_shape, dtype), # type: ignore + BlockCounts: T.Tensor(block_counts_shape, dtype), # type: ignore + BlockMask: T.Tensor(block_mask_shape, dtype), # type: ignore ): with 
T.Kernel(seq_len, batch, heads * S) as (bx, by, bz): i_t, i_b, i_hs = bx, by, bz @@ -603,9 +599,7 @@ def parallel_nsa_bwd( dk = torch.empty(NV, *k.shape, dtype=k.dtype, device=q.device) dv = torch.empty(v.shape, dtype=v.dtype, device=q.device) - block_mask = tilelang_kernel_block_mask(B, H, T, S, - BS)(block_indices.to(torch.int32), - block_counts.to(torch.int32)).to(torch.bool) + block_mask = tilelang_kernel_block_mask(B, H, T, S, BS)(block_indices.to(torch.int32), block_counts.to(torch.int32)).to(torch.bool) fused_qkv_bwd_kernel = tilelang_kernel_bwd_dqkv( batch=B, @@ -618,8 +612,7 @@ def parallel_nsa_bwd( selected_blocks=S, scale=scale, ) - fused_qkv_bwd_kernel(q, k, v, lse_slc, delta_slc, do_slc, dq, dk, dv, - block_mask.to(torch.int32)) + fused_qkv_bwd_kernel(q, k, v, lse_slc, delta_slc, do_slc, dq, dk, dv, block_mask.to(torch.int32)) dq = dq.sum(0) dk = dk.sum(0) @@ -628,7 +621,6 @@ def parallel_nsa_bwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -773,23 +765,21 @@ def parallel_nsa( Outputs of shape `[B, SEQLEN, HQ, V]` if `head_first=False` else `[B, HQ, SEQLEN, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), - (q, k, v, block_indices)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): block_counts = rearrange(block_counts, "b h t -> b t h") - assert (q.shape[2] % (k.shape[2] * 16) == 0), "Group size must be a multiple of 16 in NSA" + assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: @@ -814,7 +804,7 @@ def parallel_nsa( for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") diff --git a/examples/deepseek_nsa/example_tilelang_nsa_decode.py b/examples/deepseek_nsa/example_tilelang_nsa_decode.py index 58f435509..381d92493 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_decode.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_decode.py @@ -16,7 +16,8 @@ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def native_sparse_attention( batch, heads, @@ -25,18 +26,18 @@ def native_sparse_attention( scale=None, block_size=64, # Tile size for attention computation groups=1, # Grouped query attention (GQA) groups - selected_blocks=16 # Number of blocks to select per attention head + selected_blocks=16, # Number of blocks to select per attention head ): if scale is None: - scale = 
(1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups # Modified shapes for inference (q has seq_len=1)a q_shape = [batch, 1, heads, dim] # Changed seq_len to 1 kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, 1, head_kv, selected_blocks] # Changed seq_len to 1 - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -53,12 +54,11 @@ def native_sparse_attention( @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), # [batch, 1, heads, dim] - K: T.Tensor(kv_shape, dtype), # [batch, seq_len, head_kv, dim] - V: T.Tensor(kv_shape, dtype), # Same shape as K - BlockIndices: T.Tensor(block_indices_shape, - block_indices_dtype), # Selected block indices - Output: T.Tensor(q_shape, dtype), # Output attention tensor + Q: T.Tensor(q_shape, dtype), # [batch, 1, heads, dim] + K: T.Tensor(kv_shape, dtype), # [batch, seq_len, head_kv, dim] + V: T.Tensor(kv_shape, dtype), # Same shape as K + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), # Selected block indices + Output: T.Tensor(q_shape, dtype), # Output attention tensor ): with T.Kernel(1, NV, batch * head_kv, threads=threads) as (bx, by, bz): # Shared memory allocations for tile storage @@ -82,7 +82,7 @@ def native_sparse_attention( NS = S # Copy Q for the single position - T.copy(Q[i_b, 0, i_h * G:(i_h + 1) * G, :], Q_shared) # Changed i_t to 0 + T.copy(Q[i_b, 0, i_h * G : (i_h + 1) * G, :], Q_shared) # Changed i_t to 0 T.fill(acc_o, 0) T.fill(logsum, 0) @@ -93,16 +93,11 @@ def native_sparse_attention( i_s = BlockIndices[i_b, 0, i_h, i] * BS # Get block offset if i_s >= 0: # Skip invalid/padding blocks # Load current key block to shared memory - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) # Compute QK^T attention scores T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Online softmax with numerical stability # 1. 
Compute max for scaling @@ -122,15 +117,14 @@ def native_sparse_attention( T.copy(acc_s, acc_s_cast) # Accumulate attention-weighted values - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) # Final normalization and output for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] # Normalize by logsum T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, 0, i_h * G:(i_h + 1) * G, - i_v * BV:(i_v + 1) * BV]) # Changed i_t to 0 + T.copy(O_shared, Output[i_b, 0, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) # Changed i_t to 0 return native_sparse_attention @@ -149,21 +143,21 @@ def main(): selected_blocks=S, ) - Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) + Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) - mask = torch.randint(0, 2, (B, SEQ_LEN, groups), device='cuda') - DO = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device='cuda') + mask = torch.randint(0, 2, (B, SEQ_LEN, groups), device="cuda") + DO = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda") - block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device="cuda") for b in range(B): for t in range(SEQ_LEN_Q): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN_Q, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, SEQ_LEN_Q, H), device="cuda") out = kernel(Q, K, V, block_indices.to(torch.int32)) @@ -178,5 +172,38 @@ def main(): torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2) +def run_regression_perf(): + B, SEQ_LEN, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 16, 1, 32, torch.float16 + groups = HQ // H + SEQ_LEN_Q = 1 + kernel = native_sparse_attention( + batch=B, + heads=HQ, + seq_len=SEQ_LEN, + dim=D, + block_size=block_size, + groups=HQ // H, + selected_blocks=S, + ) + + Q = torch.randn((B, SEQ_LEN_Q, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + block_indices = torch.full((B, SEQ_LEN_Q, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + for b in range(B): + for t in range(SEQ_LEN_Q): + for h in range(H): + i_i = torch.randperm(max(1, (t // block_size)))[:S] + block_indices[b, t, h, : len(i_i)] = i_i + block_indices = block_indices.sort(-1)[0] + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, block_indices.to(torch.int32)) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py index f8a7ebfb0..7b36d6e26 100644 --- 
a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py @@ -14,18 +14,11 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def native_sparse_attention(batch, - heads, - seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16): + }, +) +def native_sparse_attention(batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) else: scale = scale * 1.44269504 # log2(e) @@ -33,9 +26,9 @@ def native_sparse_attention(batch, q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] block_indices_shape = [batch, seq_len, head_kv, selected_blocks] - block_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -52,11 +45,11 @@ def native_sparse_attention(batch, @T.prim_func def native_sparse_attention( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -77,7 +70,7 @@ def native_sparse_attention( i_b, i_h = i_bh // head_kv, i_bh % head_kv NS = S - T.copy(Q[i_b, i_t, i_h * G:(i_h + 1) * G, :], Q_shared) + T.copy(Q[i_b, i_t, i_h * G : (i_h + 1) * G, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -87,21 +80,15 @@ def native_sparse_attention( i_s = BlockIndices[i_b, i_t, i_h, i] * BS if i_s <= i_t and i_s >= 0: # [BS, BK] - T.copy(K[i_b, i_s:i_s + BS, i_h, :], K_shared) + T.copy(K[i_b, i_s : i_s + BS, i_h, :], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -121,13 +108,13 @@ def native_sparse_attention( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[i_b, i_s:i_s + BS, i_h, i_v * BV:(i_v + 1) * BV], V_shared) + T.copy(V[i_b, i_s : i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[i_b, i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, Output[i_b, i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return native_sparse_attention @@ -148,21 +135,22 @@ def main(): ) print(kernel.get_kernel_source()) torch.random.manual_seed(0) - Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, 
device='cuda').requires_grad_(True) - V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device='cuda') + Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + block_counts = torch.zeros((B, SEQ_LEN, H), dtype=torch.long, device="cuda") for b in range(B): for t in range(SEQ_LEN): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i + block_counts[b, t, h] = (block_indices[b, t, h] != SEQ_LEN).sum().item() block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, SEQ_LEN, H), device='cuda') out = kernel(Q, K, V, block_indices.to(torch.int32)) @@ -183,5 +171,43 @@ def main(): torch.testing.assert_close(ref, out, atol=1e-2, rtol=1e-2) +def run_regression_perf(): + B, SEQ_LEN, H, HQ, D, S, block_size, dtype, scale = 2, 64, 1, 16, 32, 1, 32, torch.float16, 0.1 + kernel = native_sparse_attention( + batch=B, + heads=HQ, + seq_len=SEQ_LEN, + dim=D, + is_causal=True, + block_size=block_size, + groups=HQ // H, + selected_blocks=S, + scale=scale, + ) + torch.random.manual_seed(0) + Q = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + K = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + V = torch.randn((B, SEQ_LEN, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + DO = torch.randn((B, SEQ_LEN, HQ, D), dtype=dtype, device="cuda") + block_indices = torch.full((B, SEQ_LEN, H, S), SEQ_LEN, dtype=torch.long, device="cuda") + block_counts = torch.zeros((B, SEQ_LEN, H), dtype=torch.long, device="cuda") + for b in range(B): + for t in range(SEQ_LEN): + for h in range(H): + i_i = torch.randperm(max(1, (t // block_size)))[:S] + block_indices[b, t, h, : len(i_i)] = i_i + block_counts[b, t, h] = (block_indices[b, t, h] != SEQ_LEN).sum().item() + block_indices = block_indices.sort(-1)[0] + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, block_indices.to(torch.int32)) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py index d365e7a5f..b52ebe42e 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py @@ -8,6 +8,7 @@ import tilelang.testing import fla + if 
parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -21,18 +22,11 @@ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) -def native_sparse_attention_varlen(batch, - heads, - c_seq_len, - dim, - is_causal, - scale=None, - block_size=64, - groups=1, - selected_blocks=16): + } +) +def native_sparse_attention_varlen(batch, heads, c_seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [c_seq_len, heads, dim] kv_shape = [c_seq_len, head_kv, dim] @@ -44,12 +38,12 @@ def native_sparse_attention_varlen(batch, block_counts_shape = [c_seq_len, head_kv] offsets_shape = [batch + 1] token_indices_shape = [c_seq_len, 2] - block_indices_dtype = "int32" - block_counts_dtype = "int32" - offsets_dtype = "int32" - token_indices_dtype = "int32" - dtype = "float16" - accum_dtype = "float" + block_indices_dtype = T.int32 + block_counts_dtype = T.int32 + offsets_dtype = T.int32 + token_indices_dtype = T.int32 + dtype = T.float16 + accum_dtype = T.float32 block_S = block_size block_T = min(128, tilelang.math.next_power_of_2(dim)) @@ -66,14 +60,14 @@ def native_sparse_attention_varlen(batch, @T.prim_func def native_sparse_attention_varlen( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - O_slc: T.Tensor(o_slc_shape, dtype), - BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), - BlockCounts: T.Tensor(block_counts_shape, block_counts_dtype), - Offsets: T.Tensor(offsets_shape, offsets_dtype), - TokenIndices: T.Tensor(token_indices_shape, token_indices_dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + O_slc: T.Tensor(o_slc_shape, dtype), + BlockIndices: T.Tensor(block_indices_shape, block_indices_dtype), + BlockCounts: T.Tensor(block_counts_shape, block_counts_dtype), + Offsets: T.Tensor(offsets_shape, offsets_dtype), + TokenIndices: T.Tensor(token_indices_shape, token_indices_dtype), ): with T.Kernel(c_seq_len, NV, batch * head_kv, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([G, BK], dtype) @@ -100,7 +94,7 @@ def native_sparse_attention_varlen( current_seq_len = eos - bos NS = BlockCounts[i_t, i_h] - T.copy(Q[bos + i_t, i_h * G:(i_h + 1) * G, :BK], Q_shared) + T.copy(Q[bos + i_t, i_h * G : (i_h + 1) * G, :BK], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -112,21 +106,15 @@ def native_sparse_attention_varlen( # [BS, BK] # Lei: may have some padding issues # we should learn from mha varlen templates to handle this - T.copy(K[bos + i_s:bos + i_s + BS, i_h, :BK], K_shared) + T.copy(K[bos + i_s : bos + i_s + BS, i_h, :BK], K_shared) if is_causal: for i, j in T.Parallel(G, BS): - acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(i_t >= (i_s + j), 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) # Softmax T.copy(scores_max, scores_max_prev) @@ -146,13 +134,13 @@ def native_sparse_attention_varlen( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - T.copy(V[bos + i_s:bos + i_s + BS, i_h, i_v * BV:(i_v + 
1) * BV], V_shared) + T.copy(V[bos + i_s : bos + i_s + BS, i_h, i_v * BV : (i_v + 1) * BV], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(G, BV): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, O_slc[bos + i_t, i_h * G:(i_h + 1) * G, i_v * BV:(i_v + 1) * BV]) + T.copy(O_shared, O_slc[bos + i_t, i_h * G : (i_h + 1) * G, i_v * BV : (i_v + 1) * BV]) return native_sparse_attention_varlen @@ -190,17 +178,20 @@ def parallel_nsa_fwd( o_slc = torch.empty(B, C_SEQ_LEN, HQ, V, dtype=v.dtype, device=q.device) kernel( - q.view(C_SEQ_LEN, HQ, D), k.view(C_SEQ_LEN, H, D), v.view(C_SEQ_LEN, H, D), + q.view(C_SEQ_LEN, HQ, D), + k.view(C_SEQ_LEN, H, D), + v.view(C_SEQ_LEN, H, D), o_slc.view(C_SEQ_LEN, HQ, V), block_indices.to(torch.int32).view(C_SEQ_LEN, H, S), - block_counts.to(torch.int32).view(C_SEQ_LEN, H), offsets.to(torch.int32), - token_indices.to(torch.int32)) + block_counts.to(torch.int32).view(C_SEQ_LEN, H), + offsets.to(torch.int32), + token_indices.to(torch.int32), + ) return o_slc @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, scale, offsets): ctx.dtype = q.dtype @@ -221,22 +212,25 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) return o_slc.to(q.dtype) -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -276,29 +270,27 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
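For orientation on the variable-length path above: `cu_seqlens`/`offsets` are cumulative sequence boundaries, and `token_indices` maps each token of the flattened batch back to a (sequence index, in-sequence position) pair, which is what the kernel reads per program id. A rough pure-PyTorch sketch of that mapping (not the `fla` implementation itself):

import torch

def token_indices_from_offsets(offsets: torch.Tensor) -> torch.Tensor:
    # offsets: cumulative boundaries, e.g. [0, 3, 5] -> rows (0,0), (0,1), (0,2), (1,0), (1,1)
    lengths = (offsets[1:] - offsets[:-1]).tolist()
    pairs = [(seq_id, pos) for seq_id, n in enumerate(lengths) for pos in range(int(n))]
    return torch.tensor(pairs, dtype=torch.long)

With `offsets = torch.tensor([0, 3, 5])` this yields five rows, matching the `_, t = token_indices[i]` unpacking used when building `block_indices` in the test harness.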
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, - scale, cu_seqlens) + o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: assert False, "Window size is not supported yet" else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o @@ -306,41 +298,57 @@ def parallel_nsa(q: torch.Tensor, N, C_SEQ_LEN, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 64, 1, 32, torch.float16 torch.manual_seed(42) # randomly split the sequence into N segments - offsets = torch.cat([ - torch.tensor([0], dtype=torch.long), - torch.arange(16, C_SEQ_LEN)[torch.randperm(C_SEQ_LEN - 1)[:N - 1]], - torch.tensor([C_SEQ_LEN], dtype=torch.long) - ], 0).cuda().sort()[0] + offsets = ( + torch.cat( + [ + torch.tensor([0], dtype=torch.long), + torch.arange(16, C_SEQ_LEN)[torch.randperm(C_SEQ_LEN - 1)[: N - 1]], + torch.tensor([C_SEQ_LEN], dtype=torch.long), + ], + 0, + ) + .cuda() + .sort()[0] + ) # seq-first required for inputs with variable lengths - perm_q = torch.randperm(C_SEQ_LEN, device='cuda') - perm_k = torch.randperm(C_SEQ_LEN, device='cuda') - perm_v = torch.randperm(C_SEQ_LEN, device='cuda') - q = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_q].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, HQ, - D).clone().requires_grad_(True) - k = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_k].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, H, - D).clone().requires_grad_(True) - v = torch.linspace( - 0, 1, steps=C_SEQ_LEN, dtype=dtype, - device='cuda')[perm_v].view(1, C_SEQ_LEN, 1, 1).expand(1, C_SEQ_LEN, H, - D).clone().requires_grad_(True) - g_slc = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((1, C_SEQ_LEN, HQ, D), dtype=dtype, device='cuda') + perm_q = torch.randperm(C_SEQ_LEN, device="cuda") + perm_k = torch.randperm(C_SEQ_LEN, device="cuda") + perm_v = torch.randperm(C_SEQ_LEN, device="cuda") + q = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, device="cuda")[perm_q] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, HQ, D) + .clone() + .requires_grad_(True) + ) + k = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, dtype=dtype, device="cuda")[perm_k] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, H, D) + .clone() + .requires_grad_(True) + ) + v = ( + torch.linspace(0, 1, steps=C_SEQ_LEN, 
dtype=dtype, device="cuda")[perm_v] + .view(1, C_SEQ_LEN, 1, 1) + .expand(1, C_SEQ_LEN, H, D) + .clone() + .requires_grad_(True) + ) + g_slc = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.rand((1, C_SEQ_LEN, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((1, C_SEQ_LEN, HQ, D), dtype=dtype, device="cuda") token_indices = prepare_token_indices(offsets).tolist() - block_indices = torch.full((1, C_SEQ_LEN, H, S), C_SEQ_LEN, dtype=torch.long, device='cuda') + block_indices = torch.full((1, C_SEQ_LEN, H, S), C_SEQ_LEN, dtype=torch.long, device="cuda") for i in range(C_SEQ_LEN): _, t = token_indices[i] for h in range(H): i_i = torch.randperm(max(1, tilelang.cdiv(t, block_size)))[:S] - block_indices[0, i, h, :len(i_i)] = i_i + block_indices[0, i, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (1, C_SEQ_LEN, H), device='cuda') + block_counts = torch.randint(1, S + 1, (1, C_SEQ_LEN, H), device="cuda") ref = naive_nsa( q=q, @@ -351,7 +359,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) tri = parallel_nsa( q=q, @@ -362,7 +371,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) print("tri", tri) print("ref", ref) diff --git a/examples/deepseek_nsa/example_triton_nsa_bwd.py b/examples/deepseek_nsa/example_triton_nsa_bwd.py index e912794a4..af05bfa70 100644 --- a/examples/deepseek_nsa/example_triton_nsa_bwd.py +++ b/examples/deepseek_nsa/example_triton_nsa_bwd.py @@ -8,6 +8,7 @@ import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,21 +18,44 @@ from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -46,20 +70,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc # else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), 
- (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -72,7 +94,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -92,7 +114,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -105,8 +126,7 @@ def forward(ctx, q, k, v, block_indices, block_size, scale, offsets): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices = block_indices ctx.block_size = block_size @@ -134,7 +154,8 @@ def backward(ctx, do_slc, do_swa): window_size=ctx.window_size, scale=ctx.scale, offsets=ctx.offsets, - token_indices=ctx.token_indices) + token_indices=ctx.token_indices, + ) return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None @@ -199,37 +220,56 @@ def parallel_nsa_fwd( return o_slc, lse_slc, o_swa, lse_swa -@triton.heuristics({'USE_OFFSETS': lambda args: args['offsets'] is not None}) +@triton.heuristics({"USE_OFFSETS": lambda args: args["offsets"] is not None}) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) -@triton.jit(do_not_specialize=['T']) -def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, do_slc, do_swa, dk, - dv, block_mask, offsets, chunk_indices, scale, T, B: tl.constexpr, - H: tl.constexpr, HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, - V: tl.constexpr, M: tl.constexpr, BS: tl.constexpr, - WS: tl.constexpr, BK: tl.constexpr, BV: tl.constexpr, - USE_OFFSETS: tl.constexpr): +@triton.jit(do_not_specialize=["T"]) +def parallel_nsa_bwd_kernel_dkv( + q, + k, + v, + lse_slc, + lse_swa, + delta_slc, + delta_swa, + do_slc, + do_swa, + dk, + dv, + block_mask, + offsets, + chunk_indices, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + M: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: 
tl.constexpr, + USE_OFFSETS: tl.constexpr, +): i_v, i_s, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + - 1).to(tl.int32) + i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: bos, eos = i_b * T, i_b * T + T - p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), - (1, 0)) - p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), - (BS, BV), (1, 0)) - p_dk = tl.make_block_ptr(dk + (i_v * B * T * H + bos * H + i_h) * K, (T, K), (H * K, 1), - (i_s * BS, 0), (BS, BK), (1, 0)) - p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), - (BS, BV), (1, 0)) + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) + p_dk = tl.make_block_ptr(dk + (i_v * B * T * H + bos * H + i_h) * K, (T, K), (H * K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) # [BS, BK] b_k = tl.load(p_k, boundary_check=(0, 1)) @@ -241,14 +281,12 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, for i in range(i_s * BS, T): b_m_slc = tl.load(block_mask + (bos + i) * H * M + i_h * M + i_s) if b_m_slc: - p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_do_slc = tl.make_block_ptr(do_slc + (bos + i) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_do_slc = tl.make_block_ptr(do_slc + (bos + i) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i) * HQ + i_h * G + tl.arange(0, G) p_delta_slc = delta_slc + (bos + i) * HQ + i_h * G + tl.arange(0, G) # [G, BV] @@ -272,14 +310,12 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, if WS > 0: o_s = i_s * BS + tl.arange(0, BS) if max(i_s * BS, i - WS + 1) < min((i_s + 1) * BS, i + 1): - p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), - (G, BK), (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_do_swa = tl.make_block_ptr(do_swa + (bos + i) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_do_swa = tl.make_block_ptr(do_swa + (bos + i) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i) * HQ + i_h * G + tl.arange(0, G) p_delta_swa = delta_swa + (bos + i) * HQ + i_h * G + tl.arange(0, G) # [G, BV] @@ -304,12 +340,19 @@ def parallel_nsa_bwd_kernel_dkv(q, k, v, lse_slc, lse_swa, delta_slc, delta_swa, tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) -@triton.heuristics( - {'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor)}) +@triton.heuristics({"USE_BLOCK_COUNTS": lambda 
args: isinstance(args["block_counts"], torch.Tensor)}) @triton.jit -def parallel_nsa_kernel_mask(block_indices, block_counts, block_mask, T: tl.constexpr, - H: tl.constexpr, S: tl.constexpr, BS: tl.constexpr, NS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_kernel_mask( + block_indices, + block_counts, + block_mask, + T: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + NS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_b, i_hs = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_h, i_s = i_hs // S, i_hs % S @@ -320,31 +363,56 @@ def parallel_nsa_kernel_mask(block_indices, block_counts, block_mask, T: tl.cons b_m = b_i * BS <= i_t if b_i < NS and b_i >= 0: - tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, - b_m.to(block_mask.dtype.element_ty)) + tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, b_m.to(block_mask.dtype.element_ty)) -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor) -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) -@triton.jit(do_not_specialize=['T']) -def parallel_nsa_bwd_kernel_dq(q, k, v, lse_slc, delta_slc, do_slc, lse_swa, delta_swa, do_swa, dq, - scale, block_indices, block_counts, offsets, token_indices, T, - B: tl.constexpr, H: tl.constexpr, HQ: tl.constexpr, G: tl.constexpr, - K: tl.constexpr, V: tl.constexpr, S: tl.constexpr, BS: tl.constexpr, - WS: tl.constexpr, BK: tl.constexpr, BV: tl.constexpr, - USE_OFFSETS: tl.constexpr, USE_BLOCK_COUNTS: tl.constexpr): +@triton.jit(do_not_specialize=["T"]) +def parallel_nsa_bwd_kernel_dq( + q, + k, + v, + lse_slc, + delta_slc, + do_slc, + lse_swa, + delta_swa, + do_swa, + dq, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -449,27 +517,49 @@ def parallel_nsa_bwd_kernel_dq(q, k, v, lse_slc, delta_slc, do_slc, lse_swa, del tl.store(p_dq, (b_dq_slc + b_dq_swa).to(p_dq.dtype.element_ty), boundary_check=(0, 1)) -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) 
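The mask kernel above writes, for every (token, head), one boolean per selected block: true when the slot is within the per-token block count and the block starts at or before the query position. A dense PyTorch sketch with the same semantics (illustrative only, not the kernel's implementation):

import torch

def reference_block_mask(block_indices, block_counts, block_size, num_key_blocks):
    # block_indices: [B, T, H, S] (long), block_counts: [B, T, H] -> bool mask of shape [B, T, H, NS]
    B, T, H, S = block_indices.shape
    device = block_indices.device
    t = torch.arange(T, device=device).view(1, T, 1, 1)
    s = torch.arange(S, device=device).view(1, 1, 1, S)
    in_count = s < block_counts.unsqueeze(-1)                      # slot is among the counted selections
    in_range = (block_indices >= 0) & (block_indices < num_key_blocks)
    causal = block_indices * block_size <= t                       # block starts at or before the query token
    keep = in_count & in_range & causal
    mask = torch.zeros(B, T, H, num_key_blocks, dtype=torch.bool, device=device)
    b, ti, h, si = torch.nonzero(keep, as_tuple=True)
    mask[b, ti, h, block_indices[b, ti, h, si]] = True
    return mask

This mirrors how `parallel_nsa_block_mask` allocates a `[B, T, H, NS]` boolean tensor before launching the Triton kernel.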
@triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -484,20 +574,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -510,7 +598,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -529,13 +617,12 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc tl.store(p_lse_slc, b_m_slc.to(p_lse_slc.dtype.element_ty)) if WS > 0: - p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_swa = tl.zeros([G, BV], dtype=tl.float32) - b_m_swa = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_swa = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_swa = tl.zeros([G], dtype=tl.float32) for i_s in range(max(0, i_t - WS + 1), i_t + 1, BS): p_k_swa = tl.make_block_ptr(k, (K, T), (1, H * 
K), (0, i_s), (BK, BS), (0, 1)) @@ -546,7 +633,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_swa = tl.load(p_v_swa, boundary_check=(0, 1)) # [G, BS] b_s_swa = tl.dot(b_q, b_k_swa) - b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float('-inf')) + b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float("-inf")) # [G] b_m_swa, b_mp_swa = tl.maximum(b_m_swa, tl.max(b_s_swa, 1)), b_m_swa @@ -593,14 +680,8 @@ def parallel_nsa_block_mask( block_mask = torch.zeros(B, T, H, NS, dtype=torch.bool, device=block_indices.device) parallel_nsa_kernel_mask[(T, B, H * S)]( - block_indices=block_indices, - block_counts=block_counts, - block_mask=block_mask, - T=T, - H=H, - S=S, - BS=BS, - NS=NS) + block_indices=block_indices, block_counts=block_counts, block_mask=block_mask, T=T, H=H, S=S, BS=BS, NS=NS + ) return block_mask @@ -676,7 +757,8 @@ def parallel_nsa_bwd( BS=BS, WS=WS, BK=BK, - BV=BV) + BV=BV, + ) dq = dq.sum(0) if offsets is not None: @@ -719,14 +801,14 @@ def parallel_nsa_bwd( BS=BS, WS=WS, BK=BK, - BV=BV) + BV=BV, + ) dk = dk.sum(0) return dq, dk, dv @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -749,7 +831,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -781,22 +864,25 @@ def backward(ctx, do_slc, do_swa): window_size=ctx.window_size, scale=ctx.scale, offsets=ctx.offsets, - token_indices=ctx.token_indices) + token_indices=ctx.token_indices, + ) return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -836,51 +922,49 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
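Editor's note (not part of the patch): for reviewers who want to sanity-check the reformatted `parallel_nsa` wrapper without replaying the whole hunk, here is a minimal call sketch. Shapes and dtypes mirror the file's own `__main__` self-test; it assumes a CUDA device and the definitions already present in this module, and is illustrative rather than authoritative.

```python
import torch

# Sizes taken from the example's self-test: HQ = 16 query heads share H = 1 KV head.
B, T, H, HQ, D, S, block_size = 2, 64, 1, 16, 32, 1, 32
q = torch.randn(B, T, HQ, D, device="cuda", dtype=torch.float16)
k = torch.randn(B, T, H, D, device="cuda", dtype=torch.float16)
v = torch.randn(B, T, H, D, device="cuda", dtype=torch.float16)
g_slc = torch.ones(B, T, HQ, device="cuda", dtype=torch.float16)  # gate for the selected-block branch
g_swa = torch.ones(B, T, HQ, device="cuda", dtype=torch.float16)  # gate for the sliding-window branch
# One selected KV block per (batch, token, kv-head); all entries point at block 0 here.
block_indices = torch.zeros(B, T, H, S, dtype=torch.long, device="cuda")
block_counts = torch.ones(B, T, H, dtype=torch.long, device="cuda")

o = parallel_nsa(q, k, v, g_slc, g_swa, block_indices,
                 block_counts=block_counts, block_size=block_size,
                 window_size=0)  # output shape: [B, T, HQ, D]
```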
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o if __name__ == "__main__": B, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 32, 1, 32, torch.float16 torch.random.manual_seed(0) - q = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - k = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - v = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device='cuda') + q = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + k = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + v = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device="cuda") for b in range(B): for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") ref = naive_nsa( q=q, diff --git a/examples/deepseek_nsa/example_triton_nsa_fwd.py b/examples/deepseek_nsa/example_triton_nsa_fwd.py index 2c740013a..c9ab28daa 100644 --- a/examples/deepseek_nsa/example_triton_nsa_fwd.py +++ b/examples/deepseek_nsa/example_triton_nsa_fwd.py @@ -8,6 +8,7 @@ import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,21 +18,44 @@ from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda 
args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H @@ -46,20 +70,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc # else: NS = S - p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -72,7 +94,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -92,7 +114,6 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -105,8 +126,7 @@ def forward(ctx, q, k, v, block_indices, block_size, scale, offsets): # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] token_indices = prepare_token_indices(offsets) if offsets is not None else None - o, lse = parallel_nsa_fwd( - q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) + o, lse = parallel_nsa_fwd(q=q, k=k, v=v, block_indices=block_indices, block_size=block_size, scale=scale) ctx.save_for_backward(q, k, v, o, lse) ctx.block_indices 
= block_indices ctx.block_size = block_size @@ -177,7 +197,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -200,7 +219,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -212,18 +232,20 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -263,51 +285,49 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o if __name__ == "__main__": B, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 32, 1, 32, torch.float16 torch.random.manual_seed(0) - q = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda').requires_grad_(True) - k = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - v = torch.randn((B, T, H, D), dtype=dtype, device='cuda').requires_grad_(True) - g_slc = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) 
- g_swa = torch.ones((B, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, T, HQ, D), dtype=dtype, device='cuda') - - block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device='cuda') + q = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda").requires_grad_(True) + k = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + v = torch.randn((B, T, H, D), dtype=dtype, device="cuda").requires_grad_(True) + g_slc = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.ones((B, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, T, HQ, D), dtype=dtype, device="cuda") + + block_indices = torch.full((B, T, H, S), T, dtype=torch.long, device="cuda") for b in range(B): for t in range(T): for h in range(H): i_i = torch.randperm(max(1, (t // block_size)))[:S] - block_indices[b, t, h, :len(i_i)] = i_i + block_indices[b, t, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (B, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (B, T, H), device="cuda") ref = naive_nsa( q=q, diff --git a/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py index 9ccbff6a4..cb4eb6d7b 100644 --- a/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_triton_nsa_fwd_varlen.py @@ -8,6 +8,7 @@ import triton.language as tl import fla + if parse(fla.__version__) < parse("0.2.1"): from fla.ops.common.utils import prepare_token_indices else: @@ -17,27 +18,49 @@ from einops import rearrange -@triton.heuristics({ - 'USE_OFFSETS': lambda args: args['offsets'] is not None, - 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), -}) +@triton.heuristics( + { + "USE_OFFSETS": lambda args: args["offsets"] is not None, + "USE_BLOCK_COUNTS": lambda args: isinstance(args["block_counts"], torch.Tensor), + } +) @triton.autotune( configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], - key=['BS', 'BK', 'BV'], + key=["BS", "BK", "BV"], ) @triton.jit -def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, block_indices, - block_counts, offsets, token_indices, T, H: tl.constexpr, - HQ: tl.constexpr, G: tl.constexpr, K: tl.constexpr, V: tl.constexpr, - S: tl.constexpr, BS: tl.constexpr, WS: tl.constexpr, BK: tl.constexpr, - BV: tl.constexpr, USE_OFFSETS: tl.constexpr, - USE_BLOCK_COUNTS: tl.constexpr): +def parallel_nsa_fwd_kernel( + q, + k, + v, + o_slc, + o_swa, + lse_slc, + lse_swa, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + WS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr, +): i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) i_b, i_h = i_bh // H, i_bh % H if USE_OFFSETS: - i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + - 1).to(tl.int32) + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) T = eos - bos else: @@ -52,20 +75,18 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc else: NS = S - p_q = 
tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), - (1, 0)) + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ * K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) # the Q block is kept in the shared memory throughout the whole kernel # [G, BK] b_q = tl.load(p_q, boundary_check=(0, 1)) b_q = (b_q * scale).to(b_q.dtype) - p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), - (G, BV), (1, 0)) + p_o_slc = tl.make_block_ptr(o_slc + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_slc = lse_slc + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_slc = tl.zeros([G, BV], dtype=tl.float32) - b_m_slc = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_slc = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_slc = tl.zeros([G], dtype=tl.float32) for i in range(NS): i_s = tl.load(block_indices + i).to(tl.int32) * BS @@ -78,7 +99,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_slc = tl.load(p_v_slc, boundary_check=(0, 1)) # [G, BS] b_s_slc = tl.dot(b_q, b_k_slc) - b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float('-inf')) + b_s_slc = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_slc, float("-inf")) # [G] b_m_slc, b_mp_slc = tl.maximum(b_m_slc, tl.max(b_s_slc, 1)), b_m_slc @@ -97,13 +118,12 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc tl.store(p_lse_slc, b_m_slc.to(p_lse_slc.dtype.element_ty)) if WS > 0: - p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), - (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_o_swa = tl.make_block_ptr(o_swa + (bos + i_t) * HQ * V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) p_lse_swa = lse_swa + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) # [G, BV] b_o_swa = tl.zeros([G, BV], dtype=tl.float32) - b_m_swa = tl.full([G], float('-inf'), dtype=tl.float32) + b_m_swa = tl.full([G], float("-inf"), dtype=tl.float32) b_acc_swa = tl.zeros([G], dtype=tl.float32) for i_s in range(max(0, i_t - WS + 1), i_t + 1, BS): p_k_swa = tl.make_block_ptr(k, (K, T), (1, H * K), (0, i_s), (BK, BS), (0, 1)) @@ -114,7 +134,7 @@ def parallel_nsa_fwd_kernel(q, k, v, o_slc, o_swa, lse_slc, lse_swa, scale, bloc b_v_swa = tl.load(p_v_swa, boundary_check=(0, 1)) # [G, BS] b_s_swa = tl.dot(b_q, b_k_swa) - b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float('-inf')) + b_s_swa = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s_swa, float("-inf")) # [G] b_m_swa, b_mp_swa = tl.maximum(b_m_swa, tl.max(b_s_swa, 1)), b_m_swa @@ -196,7 +216,6 @@ def parallel_nsa_fwd( @torch.compile class ParallelNSAFunction(torch.autograd.Function): - @staticmethod @contiguous @autocast_custom_fwd @@ -219,7 +238,8 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, window_size=window_size, scale=scale, offsets=offsets, - token_indices=token_indices) + token_indices=token_indices, + ) ctx.save_for_backward(q, k, v, o_slc, lse_slc, o_swa, lse_swa) ctx.block_indices = block_indices ctx.block_counts = block_counts @@ -231,18 +251,20 @@ def forward(ctx, q, k, v, block_indices, block_counts, block_size, window_size, return o_slc.to(q.dtype), o_swa.to(q.dtype) if o_swa is not None else o_swa -def parallel_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - 
block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def parallel_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -282,29 +304,27 @@ def parallel_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA" if isinstance(block_counts, int): block_indices = block_indices[:, :, :, :block_counts] block_counts = None - o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, - window_size, scale, cu_seqlens) + o_slc, o_swa = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, window_size, scale, cu_seqlens) if window_size > 0: o = torch.addcmul(o_slc * g_slc.unsqueeze(-1), o_swa, g_swa.unsqueeze(-1)) else: o = o_slc * g_slc.unsqueeze(-1) if head_first: - o = rearrange(o, 'b t h d -> b h t d') + o = rearrange(o, "b t h d -> b h t d") return o @@ -312,38 +332,35 @@ def parallel_nsa(q: torch.Tensor, N, T, H, HQ, D, S, block_size, dtype = 2, 64, 1, 16, 64, 1, 32, torch.float16 torch.manual_seed(42) # randomly split the sequence into N segments - offsets = torch.cat([ - torch.tensor([0], dtype=torch.long), - torch.arange(16, T)[torch.randperm(T - 1)[:N - 1]], - torch.tensor([T], dtype=torch.long) - ], 0).cuda().sort()[0] + offsets = ( + torch.cat( + [torch.tensor([0], dtype=torch.long), torch.arange(16, T)[torch.randperm(T - 1)[: N - 1]], torch.tensor([T], dtype=torch.long)], + 0, + ) + .cuda() + .sort()[0] + ) # offsets.shape is [N+1] # seq-first required for inputs with variable lengths - perm_q = torch.randperm(T, device='cuda') - perm_k = torch.randperm(T, device='cuda') - perm_v = torch.randperm(T, device='cuda') - q = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_q].view(1, T, 1, 1).expand(1, T, HQ, D).clone().requires_grad_(True) - k = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_k].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) - v = torch.linspace( - 0, 1, steps=T, dtype=dtype, - device='cuda')[perm_v].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) - g_slc = torch.rand((1, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - g_swa = torch.rand((1, T, HQ), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((1, T, HQ, 
D), dtype=dtype, device='cuda') + perm_q = torch.randperm(T, device="cuda") + perm_k = torch.randperm(T, device="cuda") + perm_v = torch.randperm(T, device="cuda") + q = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_q].view(1, T, 1, 1).expand(1, T, HQ, D).clone().requires_grad_(True) + k = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_k].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) + v = torch.linspace(0, 1, steps=T, dtype=dtype, device="cuda")[perm_v].view(1, T, 1, 1).expand(1, T, H, D).clone().requires_grad_(True) + g_slc = torch.rand((1, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + g_swa = torch.rand((1, T, HQ), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((1, T, HQ, D), dtype=dtype, device="cuda") token_indices = prepare_token_indices(offsets).tolist() - block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device='cuda') + block_indices = torch.full((1, T, H, S), T, dtype=torch.long, device="cuda") for i in range(T): _, t = token_indices[i] for h in range(H): i_i = torch.randperm(max(1, triton.cdiv(t, block_size)))[:S] - block_indices[0, i, h, :len(i_i)] = i_i + block_indices[0, i, h, : len(i_i)] = i_i block_indices = block_indices.sort(-1)[0] - block_counts = torch.randint(1, S + 1, (1, T, H), device='cuda') + block_counts = torch.randint(1, S + 1, (1, T, H), device="cuda") ref = naive_nsa( q=q, @@ -354,7 +371,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) tri = parallel_nsa( q=q, @@ -365,7 +383,8 @@ def parallel_nsa(q: torch.Tensor, block_indices=block_indices, block_counts=block_counts, block_size=block_size, - cu_seqlens=offsets) + cu_seqlens=offsets, + ) print("tri", tri) print("ref", ref) diff --git a/examples/deepseek_nsa/reference.py b/examples/deepseek_nsa/reference.py index 958d0c19e..58083108e 100644 --- a/examples/deepseek_nsa/reference.py +++ b/examples/deepseek_nsa/reference.py @@ -6,18 +6,20 @@ from einops import rearrange, repeat -def naive_nsa(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - g_slc: torch.Tensor, - g_swa: torch.Tensor, - block_indices: torch.LongTensor, - block_counts: Optional[Union[torch.LongTensor, int]] = None, - block_size: int = 64, - window_size: int = 0, - scale: Optional[float] = None, - cu_seqlens: Optional[torch.LongTensor] = None, - head_first: bool = False) -> torch.Tensor: +def naive_nsa( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g_slc: torch.Tensor, + g_swa: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Optional[Union[torch.LongTensor, int]] = None, + block_size: int = 64, + window_size: int = 0, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False, +) -> torch.Tensor: r""" Args: q (torch.Tensor): @@ -57,26 +59,24 @@ def naive_nsa(q: torch.Tensor, Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
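Editor's note (not part of the patch): `naive_nsa` expands each selected block id into `block_size` token positions, then masks positions that are non-causal or that fall beyond the per-query `block_counts`. A small self-contained sketch of that indexing (variable names are illustrative, not the reference's own):

```python
import torch

block_size, S = 4, 3                      # BS tokens per block, S selected blocks
block_idx = torch.tensor([0, 2, 5])       # selected block ids for one (batch, token, head)
block_count = 2                           # only the first 2 selected blocks are live
t_q = 10                                  # causal query position

# Token positions covered by the selected blocks: [S * BS]
i_i = (block_idx * block_size).repeat_interleave(block_size) + torch.arange(block_size).repeat(S)
# Block rank of each expanded position, compared against block_count
c = torch.arange(S).repeat_interleave(block_size)

keep = (i_i <= t_q) & (c < block_count)   # complement of the -inf mask used in the reference
print(i_i.tolist())    # [0, 1, 2, 3, 8, 9, 10, 11, 20, 21, 22, 23]
print(keep.tolist())   # position 11 is non-causal; the third block exceeds block_count
```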
""" if scale is None: - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 if cu_seqlens is not None: assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" if head_first: - raise RuntimeError( - "Sequences with variable lengths are not supported for head-first mode") + raise RuntimeError("Sequences with variable lengths are not supported for head-first mode") if head_first: - q, k, v, block_indices = map(lambda x: rearrange(x, 'b h t d -> b t h d'), - (q, k, v, block_indices)) - g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h'), (g_slc, g_swa)) + q, k, v, block_indices = map(lambda x: rearrange(x, "b h t d -> b t h d"), (q, k, v, block_indices)) + g_slc, g_swa = map(lambda x: rearrange(x, "b h t -> b t h"), (g_slc, g_swa)) if isinstance(block_counts, torch.Tensor): - block_counts = rearrange(block_counts, 'b h t -> b t h') + block_counts = rearrange(block_counts, "b h t -> b t h") dtype = q.dtype G = q.shape[2] // k.shape[2] BS = block_size S = block_indices.shape[-1] - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) if isinstance(block_counts, torch.Tensor): - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) @@ -86,14 +86,11 @@ def naive_nsa(q: torch.Tensor, if cu_seqlens is None: varlen = False B, T = q.shape[:2] - cu_seqlens = torch.cat( - [block_indices.new_tensor(range(0, B * T, T)), - block_indices.new_tensor([B * T])]) + cu_seqlens = torch.cat([block_indices.new_tensor(range(0, B * T, T)), block_indices.new_tensor([B * T])]) for i in range(len(cu_seqlens) - 1): if not varlen: - q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[ - i], block_indices[i] + q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = q[i], k[i], v[i], g_slc[i], g_swa[i], block_indices[i] if isinstance(block_counts, torch.Tensor): s_b = block_counts[i] else: @@ -101,10 +98,10 @@ def naive_nsa(q: torch.Tensor, else: T = cu_seqlens[i + 1] - cu_seqlens[i] q_b, k_b, v_b, g_slc_b, g_swa_b, i_b = map( - lambda x: x[0][cu_seqlens[i]:cu_seqlens[i + 1]], - (q, k, v, g_slc, g_swa, block_indices)) + lambda x: x[0][cu_seqlens[i] : cu_seqlens[i + 1]], (q, k, v, g_slc, g_swa, block_indices) + ) if isinstance(block_counts, torch.Tensor): - s_b = block_counts[0][cu_seqlens[i]:cu_seqlens[i + 1]] + s_b = block_counts[0][cu_seqlens[i] : cu_seqlens[i + 1]] else: s_b = block_counts @@ -126,34 +123,28 @@ def naive_nsa(q: torch.Tensor, else: s_i = s_b # [S*BS, HQ, -1] - k_i_slc, v_i_slc = map( - lambda x: x.gather( - 0, - i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) + k_i_slc, v_i_slc = map(lambda x: x.gather(0, i_i.clamp(0, T - 1).unsqueeze(-1).expand(*i_i.shape, x.shape[-1])), (k_b, v_b)) # [S*BS, HQ] - attn_slc = torch.einsum('h d, n h d -> n h', q_i, k_i_slc).masked_fill( - torch.logical_or(i_i < 0, i_i > i_q) | - (c >= s_i if block_counts is not None else False), float('-inf')).softmax(0) + attn_slc = ( + torch.einsum("h d, n h d -> n h", q_i, k_i_slc) + .masked_fill(torch.logical_or(i_i < 0, i_i > i_q) | (c >= s_i if block_counts is not None else False), float("-inf")) + .softmax(0) + ) if not varlen: - o_slc[i, i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * 
g_slc_i.unsqueeze(-1) + o_slc[i, i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) else: - o_slc[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_slc, - v_i_slc) * g_slc_i.unsqueeze(-1) + o_slc[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_slc, v_i_slc) * g_slc_i.unsqueeze(-1) if window_size > 0: - k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1):i_q + 1], - (k_b, v_b)) - attn_swa = torch.einsum('h d, n h d -> n h', q_i, k_i_swa).softmax(0) + k_i_swa, v_i_swa = map(lambda x: x[max(0, i_q - window_size + 1) : i_q + 1], (k_b, v_b)) + attn_swa = torch.einsum("h d, n h d -> n h", q_i, k_i_swa).softmax(0) if not varlen: - o_swa[i, i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[i, i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) else: - o_swa[0][cu_seqlens[i] + i_q] = torch.einsum('n h, n h v -> h v', attn_swa, - v_i_swa) * g_swa_i.unsqueeze(-1) + o_swa[0][cu_seqlens[i] + i_q] = torch.einsum("n h, n h v -> h v", attn_swa, v_i_swa) * g_swa_i.unsqueeze(-1) if head_first: - o_slc = rearrange(o_slc, 'b t h d -> b h t d') - o_swa = rearrange(o_swa, 'b t h d -> b h t d') + o_slc = rearrange(o_slc, "b t h d -> b h t d") + o_swa = rearrange(o_swa, "b t h d -> b h t d") return o_slc.to(dtype) + o_swa.to(dtype) if o_swa is not None else o_slc.to(dtype) @@ -187,7 +178,7 @@ def naive_nsa_simple( o (torch.Tensor): Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. """ - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 dtype = q.dtype HQ = q.shape[2] @@ -197,8 +188,8 @@ def naive_nsa_simple( BS = block_size S = block_indices.shape[-1] SELECTED_BLOCKS_SIZE = S * BS - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) o = torch.zeros_like(v) @@ -228,10 +219,10 @@ def naive_nsa_simple( v_i[t, h] = v_b[selected_block_index, h, :] # [S*BS, HQ] - attn = torch.einsum('h d, n h d -> n h', q_i, k_i) - attn = attn.masked_fill((i_i > i_q) | (c >= s_i), float('-inf')) + attn = torch.einsum("h d, n h d -> n h", q_i, k_i) + attn = attn.masked_fill((i_i > i_q) | (c >= s_i), float("-inf")) attn = torch.softmax(attn, dim=0) - o[i, i_q] = torch.einsum('n h, n h v -> h v', attn, v_i) + o[i, i_q] = torch.einsum("n h, n h v -> h v", attn, v_i) return o.to(dtype) @@ -265,7 +256,7 @@ def naive_nsa_simple_inference( o (torch.Tensor): Outputs of shape `[B, 1, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
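Editor's note (not part of the patch): all of the reference paths first broadcast the KV heads to the query heads (grouped-query attention), so the dense einsums can treat every query head independently. A minimal illustration of that `einops.repeat` step:

```python
import torch
from einops import repeat

B, T, H, HQ, D = 1, 8, 2, 16, 4
G = HQ // H                                    # query heads served by each KV head
k = torch.randn(B, T, H, D)
k_expanded = repeat(k, "b t h d -> b t (h g) d", g=G)

print(k_expanded.shape)                        # torch.Size([1, 8, 16, 4])
# Query heads 0..G-1 all read KV head 0, heads G..2G-1 read KV head 1, and so on.
assert torch.equal(k_expanded[:, :, 0], k_expanded[:, :, G - 1])
assert torch.equal(k_expanded[:, :, G], k_expanded[:, :, 2 * G - 1])
```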
""" - scale = k.shape[-1]**-0.5 + scale = k.shape[-1] ** -0.5 dtype = q.dtype HQ = q.shape[2] @@ -275,8 +266,8 @@ def naive_nsa_simple_inference( BS = block_size S = block_indices.shape[-1] SELECTED_BLOCKS_SIZE = S * BS - k, v, block_indices = (repeat(x, 'b t h d -> b t (h g) d', g=G) for x in (k, v, block_indices)) - block_counts = repeat(block_counts, 'b t h -> b t (h g)', g=G) + k, v, block_indices = (repeat(x, "b t h d -> b t (h g) d", g=G) for x in (k, v, block_indices)) + block_counts = repeat(block_counts, "b t h -> b t (h g)", g=G) c = torch.arange(S).repeat_interleave(BS).unsqueeze(1).expand(-1, q.shape[2]).to(q.device) q, k, v = map(lambda x: x.float(), (q, k, v)) o = torch.zeros_like(q) @@ -306,9 +297,9 @@ def naive_nsa_simple_inference( v_i[t, h] = v_b[selected_block_index, h, :] # [S*BS, HQ] - attn = torch.einsum('h d, n h d -> n h', q_i, k_i) - attn = attn.masked_fill((c >= s_i), float('-inf')) + attn = torch.einsum("h d, n h d -> n h", q_i, k_i) + attn = attn.masked_fill((c >= s_i), float("-inf")) attn = torch.softmax(attn, dim=0) - o[i, 0] = torch.einsum('n h, n h v -> h v', attn, v_i) + o[i, 0] = torch.einsum("n h, n h v -> h v", attn, v_i) return o.to(dtype) diff --git a/examples/deepseek_nsa/regression_example_tilelang_nsa.py b/examples/deepseek_nsa/regression_example_tilelang_nsa.py new file mode 100644 index 000000000..1858f045a --- /dev/null +++ b/examples/deepseek_nsa/regression_example_tilelang_nsa.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_tilelang_nsa_fwd +import example_tilelang_nsa_decode + + +def regression_example_tilelang_nsa_fwd(): + tilelang.testing.process_func(example_tilelang_nsa_fwd.run_regression_perf) + + +def regression_example_tilelang_nsa_fwd_decode(): + tilelang.testing.process_func(example_tilelang_nsa_decode.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_nsa/requirements.txt b/examples/deepseek_nsa/requirements.txt index 777c2ad4c..e096dfd7d 100644 --- a/examples/deepseek_nsa/requirements.txt +++ b/examples/deepseek_nsa/requirements.txt @@ -1 +1 @@ -git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e \ No newline at end of file +git+https://github.com/fla-org/flash-linear-attention@c3bd56589033610264532b11f0972c69e4645f6e diff --git a/examples/deepseek_v32/README.md b/examples/deepseek_v32/README.md index 8457745b0..01a14b6b2 100644 --- a/examples/deepseek_v32/README.md +++ b/examples/deepseek_v32/README.md @@ -121,7 +121,7 @@ for i_i in T.Pipelined(NI, num_stages=num_stages): # ... compute attention over selected tokens ``` -This reduces compute from O(seq_len * seq_len_kv) to O(seq_len * topk). The causal mask is enforced by checking whether each index position is valid: +This reduces compute from O(seq_len *seq_len_kv) to O(seq_len* topk). 
The causal mask is enforced by checking whether each index position is valid: ```python for bi_i in T.Parallel(BI): @@ -193,10 +193,10 @@ for i_i in T.Pipelined(NI, num_stages=num_stages): # Load KV data for selected indices for bi_i, d_i in T.Parallel(BI, D): KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BI + bi_i], bz, d_i] - + # Recompute attention scores for backward T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) - + # Apply softmax gradient: dP = P * (dP_raw - Delta) for h_i, bi_i in T.Parallel(padded_H, BI): acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale @@ -204,7 +204,7 @@ for i_i in T.Pipelined(NI, num_stages=num_stages): The key gradient computations are: - **dQ = dP @ K** (query gradients) -- **dK = dP^T @ Q** (key gradients) +- **dK = dP^T @ Q** (key gradients) - **dV = P^T @ dO** (value gradients) **3. Atomic Sparse Updates**: Uses atomic operations for dKV accumulation: @@ -212,7 +212,7 @@ The key gradient computations are: ```python # Atomically update dKV at selected indices for bi_i, d_i in T.Parallel(BI // split_store, D // 4): - T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4], + T.atomic_addx4(dKV[by, Indices[by, s_i, bz, i_i * BI + bi_i + s * (BI // split_store)], bz, d_i * 4], acc_dkv_shared[bi_i, d_i * 4]) ``` diff --git a/examples/deepseek_v32/fp8_lighting_indexer.py b/examples/deepseek_v32/fp8_lighting_indexer.py index 21baa8fa8..03e88dd97 100644 --- a/examples/deepseek_v32/fp8_lighting_indexer.py +++ b/examples/deepseek_v32/fp8_lighting_indexer.py @@ -28,11 +28,11 @@ def validate_tensor_match(a, b, tolerance=1e-8, tensor_name="tensor", should_rai if should_raise: assert False if not torch.isclose( - a.masked_fill(a_finite, 0), - b.masked_fill(b_finite, 0), - rtol=0, - atol=0, - equal_nan=True, + a.masked_fill(a_finite, 0), + b.masked_fill(b_finite, 0), + rtol=0, + atol=0, + equal_nan=True, ).all(): display_error_message(f"{tensor_name} Error: nonfinite value mismatch") if should_raise: @@ -55,13 +55,10 @@ def get_configs(): threads=[128, 256], block_Q=[1, 2, 4], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] class SupplyProg: - def __init__(self): self.tensors_dict = {} @@ -88,7 +85,8 @@ def supply_prog(self, params): @tilelang.jit( pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - },) + }, +) def mqa_attn_return_logits( heads, index_dim, @@ -99,9 +97,9 @@ def mqa_attn_return_logits( ): if block_Q is None: block_Q = 128 // heads - dtype = "float8_e4m3" - accum_dtype = "float" - index_dtype = "int32" + dtype = T.float8_e4m3fn + accum_dtype = T.float32 + index_dtype = T.int32 seq_len = T.dynamic("seq_len") seq_len_kv = T.dynamic("seq_len_kv") @@ -113,46 +111,42 @@ def mqa_attn_return_logits( @T.prim_func def mqa_attn_return_logits_kernel( - IndexQ: T.Tensor(index_q_shape, dtype), # type: ignore - IndexK: T.Tensor(index_k_shape, dtype), # type: ignore - IndexKScale: T.Tensor(index_k_scale_shape, accum_dtype), # type: ignore - Logits: T.Tensor(logits_shape, accum_dtype), # type: ignore - Weights: T.Tensor([seq_len, heads], accum_dtype), # type: ignore - CuSeqLenKS: T.Tensor([seq_len], index_dtype), # type: ignore - CuSeqLenKE: T.Tensor([seq_len], index_dtype), # type: ignore + IndexQ: T.Tensor(index_q_shape, 
dtype), # type: ignore + IndexK: T.Tensor(index_k_shape, dtype), # type: ignore + IndexKScale: T.Tensor(index_k_scale_shape, accum_dtype), # type: ignore + Logits: T.Tensor(logits_shape, accum_dtype), # type: ignore + Weights: T.Tensor([seq_len, heads], accum_dtype), # type: ignore + CuSeqLenKS: T.Tensor([seq_len], index_dtype), # type: ignore + CuSeqLenKE: T.Tensor([seq_len], index_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_Q), threads=threads) as bx: - index_q_shared = T.alloc_shared([block_Q * heads, index_dim], dtype) index_k_shared = T.alloc_shared([block_N, index_dim], dtype) index_k_scale_fragment = T.alloc_fragment([block_N], accum_dtype) s = T.alloc_fragment([block_N, block_Q * heads], accum_dtype) - s_reshaped = T.alloc_fragment([block_N, block_Q, heads], accum_dtype) + s_reshaped = T.reshape(s, (block_N, block_Q, heads)) logits = T.alloc_fragment([block_N, block_Q], accum_dtype) weights = T.alloc_fragment([block_Q, heads], accum_dtype) seq_len_i = bx * block_Q - cu_k_s_min = T.alloc_local([1], index_dtype) - cu_k_e_max = T.alloc_local([1], index_dtype) + cu_k_s_min = T.alloc_var(index_dtype) + cu_k_e_max = T.alloc_var(index_dtype) - cu_k_s_min[0] = 2147483647 - cu_k_e_max[0] = -2147483648 + cu_k_s_min = 2147483647 + cu_k_e_max = -2147483648 for bq_i in T.serial(block_Q): - cu_k_s_min[0] = T.min(cu_k_s_min[0], T.min(CuSeqLenKS[seq_len_i + bq_i], - seq_len_kv)) + cu_k_s_min = T.min(cu_k_s_min, T.min(CuSeqLenKS[seq_len_i + bq_i], seq_len_kv)) for bq_i in T.serial(block_Q): - cu_k_e_max[0] = T.max(cu_k_e_max[0], T.min(CuSeqLenKE[seq_len_i + bq_i], - seq_len_kv)) + cu_k_e_max = T.max(cu_k_e_max, T.min(CuSeqLenKE[seq_len_i + bq_i], seq_len_kv)) T.copy(IndexQ[seq_len_i * heads, 0], index_q_shared) T.copy(Weights[seq_len_i, 0], weights) - for nbn_i in T.Pipelined( - T.ceildiv(cu_k_e_max[0] - cu_k_s_min[0], block_N), num_stages=num_stages): - T.copy(IndexK[cu_k_s_min[0] + nbn_i * block_N, 0], index_k_shared) - T.copy(IndexKScale[cu_k_s_min[0] + nbn_i * block_N], index_k_scale_fragment) + for nbn_i in T.Pipelined(T.ceildiv(cu_k_e_max - cu_k_s_min, block_N), num_stages=num_stages): + T.copy(IndexK[cu_k_s_min + nbn_i * block_N, 0], index_k_shared) + T.copy(IndexKScale[cu_k_s_min + nbn_i * block_N], index_k_scale_fragment) T.gemm( index_k_shared, @@ -164,15 +158,14 @@ def mqa_attn_return_logits_kernel( ) for bn_i, bq_i, h_i in T.Parallel(block_N, block_Q, heads): - s_reshaped[bn_i, bq_i, - h_i] = (T.max(s[bn_i, bq_i * heads + h_i], 0) * - weights[bq_i, h_i]) * index_k_scale_fragment[bn_i] + s_reshaped[bn_i, bq_i, h_i] = (T.max(s_reshaped[bn_i, bq_i, h_i], 0) * weights[bq_i, h_i]) * index_k_scale_fragment[ + bn_i + ] T.reduce_sum(s_reshaped, logits, dim=-1, clear=True) for bq_i, bn_i in T.Parallel(block_Q, block_N): - Logits[seq_len_i + bq_i, cu_k_s_min[0] + nbn_i * block_N + bn_i] = ( - logits[bn_i, bq_i]) + Logits[seq_len_i + bq_i, cu_k_s_min + nbn_i * block_N + bn_i] = logits[bn_i, bq_i] return mqa_attn_return_logits_kernel @@ -185,38 +178,30 @@ def clean_logits_( seq_len = T.dynamic("seq_len") seq_len_kv = T.dynamic("seq_len_kv") - dtype = "float" - indices_dtype = "int32" + dtype = T.float + indices_dtype = T.int32 @T.prim_func def clean_logits_kernel( - Logits: T.Tensor([seq_len, seq_len_kv], dtype), # type: ignore - CuSeqLenKS: T.Tensor([seq_len], indices_dtype), # type: ignore - CuSeqLenKE: T.Tensor([seq_len], indices_dtype), # type: ignore + Logits: T.Tensor([seq_len, seq_len_kv], dtype), # type: ignore + CuSeqLenKS: T.Tensor([seq_len], indices_dtype), # type: 
ignore + CuSeqLenKE: T.Tensor([seq_len], indices_dtype), # type: ignore ): with T.Kernel(seq_len, threads=threads) as bx: tx = T.thread_binding(0, threads, thread="threadIdx.x") - cu_k_s = T.alloc_local([1], indices_dtype) - cu_k_e = T.alloc_local([1], indices_dtype) - cu_k_s[0] = CuSeqLenKS[bx] - cu_k_e[0] = CuSeqLenKE[bx] + cu_k_s = CuSeqLenKS[bx] + cu_k_e = CuSeqLenKE[bx] for n_i in T.Pipelined(T.ceildiv(seq_len_kv, block_K)): for k_i in T.serial(block_K // threads): idx = n_i * block_K + k_i * threads + tx - if idx < cu_k_s[0] or idx >= cu_k_e[0]: + if idx < cu_k_s or idx >= cu_k_e: Logits[bx, idx] = -T.infinity(dtype) return clean_logits_kernel -def mqa_attn_return_logits_interface(q, - kv, - kv_scales, - weights, - cu_seqlen_ks, - cu_seqlen_ke, - clean_logits=True): +def mqa_attn_return_logits_interface(q, kv, kv_scales, weights, cu_seqlen_ks, cu_seqlen_ke, clean_logits=True): seq_len, heads, index_dim = q.shape seq_len_kv = kv.shape[0] @@ -238,57 +223,48 @@ def mqa_attn_return_logits_interface(q, return logits -def ref_fp8_mqa_logits(q: torch.Tensor, kv: torch.Tensor, weights: torch.Tensor, - cu_seqlen_ks: torch.Tensor, cu_seqlen_ke: torch.Tensor): +def ref_fp8_mqa_logits(q: torch.Tensor, kv: torch.Tensor, weights: torch.Tensor, cu_seqlen_ks: torch.Tensor, cu_seqlen_ke: torch.Tensor): k = kv q = q.float() k = k.float() seq_len_kv = kv.shape[0] - mask_lo = torch.arange(0, seq_len_kv, device='cuda')[None, :] >= cu_seqlen_ks[:, None] - mask_hi = torch.arange(0, seq_len_kv, device='cuda')[None, :] < cu_seqlen_ke[:, None] + mask_lo = torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None] + mask_hi = torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None] mask = mask_lo & mask_hi - score = torch.einsum('mhd,nd->hmn', q, k) + score = torch.einsum("mhd,nd->hmn", q, k) logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0) - logits = logits.masked_fill(~mask, float('-inf')) + logits = logits.masked_fill(~mask, float("-inf")) cost = mask.sum() return logits, cost def test_fp8_lighting_indexer(S=4096, SKV=8192, H=32, HKV=1, D=64, kv_stride=1): + # initial random seed to make the performance reproducible + torch.manual_seed(0) q = torch.randn(S, H, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) kv = torch.randn(SKV, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) weights = torch.randn(S, H, device="cuda", dtype=torch.float32) p = (torch.randn(S, SKV, device="cuda", dtype=torch.float32) * 4).softmax(dim=-1) - ks, ke = generate_random_cu_seqlens( - per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) + ks, ke = generate_random_cu_seqlens(per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) - logits_ref, cost_ref = ref_fp8_mqa_logits( - q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + logits_ref, cost_ref = ref_fp8_mqa_logits(q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) q_fp8 = q.to(torch.float8_e4m3fn) kv_fp8, kv_scales = per_custom_dims_cast_to_fp8(kv, (0,), False) - logits_tl = mqa_attn_return_logits_interface( - q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) - diff = validate_tensor_match( - logits_ref, logits_tl, tolerance=1e-14, tensor_name="logits", should_raise=False) + logits_tl = mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + diff = validate_tensor_match(logits_ref, logits_tl, tolerance=1e-14, 
tensor_name="logits", should_raise=False) print(f"diff: {diff}") from tilelang.profiler import do_bench def logits_fn(): - return mqa_attn_return_logits_interface( - q=q_fp8, - kv=kv_fp8, - kv_scales=kv_scales, - weights=weights, - cu_seqlen_ks=ks, - cu_seqlen_ke=ke) + return mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof: logits_fn() @@ -302,5 +278,35 @@ def logits_fn(): print(f"cost_ref: {cost_ref}") +def run_regression_perf(S=4096, SKV=8192, H=32, HKV=1, D=64, kv_stride=1): + torch.manual_seed(0) + q = torch.randn(S, H, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) + kv = torch.randn(SKV, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) + weights = torch.randn(S, H, device="cuda", dtype=torch.float32) + p = (torch.randn(S, SKV, device="cuda", dtype=torch.float32) * 4).softmax(dim=-1) + + ks, ke = generate_random_cu_seqlens(per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) + + logits_ref, cost_ref = ref_fp8_mqa_logits(q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + + q_fp8 = q.to(torch.float8_e4m3fn) + kv_fp8, kv_scales = per_custom_dims_cast_to_fp8(kv, (0,), False) + + logits_tl = mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + diff = validate_tensor_match(logits_ref, logits_tl, tolerance=1e-14, tensor_name="logits", should_raise=False) + + from tilelang.profiler import do_bench + + def logits_fn(): + return mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) + + with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof: + logits_fn() + + print(prof.key_averages().table(sort_by="cuda_time_total", max_name_column_width=50)) + + return do_bench(logits_fn, backend="cupti") + + if __name__ == "__main__": test_fp8_lighting_indexer() diff --git a/examples/deepseek_v32/inference/README.md b/examples/deepseek_v32/inference/README.md index fe4cc21bb..60afe7ceb 100644 --- a/examples/deepseek_v32/inference/README.md +++ b/examples/deepseek_v32/inference/README.md @@ -11,4 +11,4 @@ Launch the interactive chat interface and start exploring DeepSeek's capabilitie ```bash export CONFIG=config_671B_v3.2.json torchrun --nproc-per-node ${MP} generate.py --ckpt-path ${SAVE_PATH} --config ${CONFIG} --interactive -``` \ No newline at end of file +``` diff --git a/examples/deepseek_v32/inference/config_671B_v3.2.json b/examples/deepseek_v32/inference/config_671B_v3.2.json index be88f1cca..375aa9aa2 100644 --- a/examples/deepseek_v32/inference/config_671B_v3.2.json +++ b/examples/deepseek_v32/inference/config_671B_v3.2.json @@ -23,4 +23,4 @@ "index_n_heads": 64, "index_head_dim": 128, "index_topk": 2048 -} \ No newline at end of file +} diff --git a/examples/deepseek_v32/inference/convert.py b/examples/deepseek_v32/inference/convert.py index df7943918..090be7145 100644 --- a/examples/deepseek_v32/inference/convert.py +++ b/examples/deepseek_v32/inference/convert.py @@ -42,7 +42,7 @@ def main(hf_ckpt_path, save_path, n_experts, mp): save_path (str): Path to the directory where the converted checkpoint files will be saved. n_experts (int): Total number of experts in the model. mp (int): Model parallelism factor. 
- + Returns: None """ diff --git a/examples/deepseek_v32/inference/kernel.py b/examples/deepseek_v32/inference/kernel.py index 262343536..25abf15d5 100644 --- a/examples/deepseek_v32/inference/kernel.py +++ b/examples/deepseek_v32/inference/kernel.py @@ -11,21 +11,21 @@ tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True, } -FP8 = "float8_e4m3" -BF16 = "bfloat16" -FP32 = "float32" +FP8 = T.float8_e4m3fn +BF16 = T.bfloat16 +FP32 = T.float32 def fast_log2_ceil(x): - bits_x = T.reinterpret("uint32", x) + bits_x = T.reinterpret(T.uint32, x) exp_x = (bits_x >> 23) & 0xFF man_bits = bits_x & ((1 << 23) - 1) - return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) + return T.Cast(T.int32, exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) def fast_pow2(x): bits_x = (x + 127) << 23 - return T.reinterpret("float32", bits_x) + return T.reinterpret(T.float32, bits_x) def fast_round_scale(amax, fp8_max_inv): @@ -107,8 +107,8 @@ def act_quant(x: torch.Tensor, @tilelang.jit(pass_configs=pass_configs) -def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"): - assert out_dtype in [BF16, "float32"] +def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype=T.float32): + assert out_dtype in [BF16, T.float32] M = T.dynamic("M") group_size = 128 diff --git a/examples/deepseek_v32/inference/requirements.txt b/examples/deepseek_v32/inference/requirements.txt index 604fed552..8c208a8b1 100644 --- a/examples/deepseek_v32/inference/requirements.txt +++ b/examples/deepseek_v32/inference/requirements.txt @@ -2,4 +2,4 @@ torch transformers safetensors fast_hadamard_transform -tilelang==0.1.6 \ No newline at end of file +tilelang==0.1.6 diff --git a/examples/deepseek_v32/regression_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/regression_tilelang_example_deepseek_v32.py new file mode 100644 index 000000000..0610002a6 --- /dev/null +++ b/examples/deepseek_v32/regression_tilelang_example_deepseek_v32.py @@ -0,0 +1,30 @@ +import tilelang.testing +import fp8_lighting_indexer +import sparse_mla_bwd +import sparse_mla_fwd +import sparse_mla_fwd_pipelined +import topk_selector + + +def regression_topk_selector(): + tilelang.testing.process_func(topk_selector.run_regression_perf) + + +def regression_fp8_lighting_indexer(): + tilelang.testing.process_func(fp8_lighting_indexer.run_regression_perf, S=512, SKV=1024, H=32, HKV=1, D=64, kv_stride=1) + + +def regression_sparse_mla_fwd(): + tilelang.testing.process_func(sparse_mla_fwd.run_regression_perf, S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256) + + +def regression_sparse_mla_fwd_pipelined(): + tilelang.testing.process_func(sparse_mla_fwd_pipelined.run_regression_perf, S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256) + + +def regression_sparse_mla_bwd(): + tilelang.testing.process_func(sparse_mla_bwd.run_regression_perf, S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/deepseek_v32/sparse_mla_bwd.py b/examples/deepseek_v32/sparse_mla_bwd.py index e7f9c6093..527de22b3 100644 --- a/examples/deepseek_v32/sparse_mla_bwd.py +++ b/examples/deepseek_v32/sparse_mla_bwd.py @@ -13,18 +13,18 @@ def preprocess( D, block_ND=32, num_stages=5, - dtype="bfloat16", - accum_dtype="float", + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 shape = [B, S, H, D] @T.prim_func def preprocess_kernel( - O: T.Tensor(shape, dtype), - dO: 
T.Tensor(shape, dtype), - Delta: T.Tensor([B, S, H], accum_dtype), + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([B, S, H], accum_dtype), ): with T.Kernel(H, T.ceildiv(S, block_ND), B) as (bx, by, bz): o = T.alloc_fragment([block_ND, block_ND], accum_dtype) @@ -33,16 +33,12 @@ def preprocess_kernel( acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) T.clear(acc) for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): - T.copy( - O[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], - o) - T.copy( - dO[bz, by * block_ND:(by + 1) * block_ND, bx, k * block_ND:(k + 1) * block_ND], - do) + T.copy(O[bz, by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], o) + T.copy(dO[bz, by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], do) for i, j in T.Parallel(block_ND, block_ND): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, by * block_ND:(by + 1) * block_ND, bx]) + T.copy(delta, Delta[bz, by * block_ND : (by + 1) * block_ND, bx]) return preprocess_kernel @@ -56,22 +52,22 @@ def postprocess( kv_group=1, block_N=64, threads=128, - dtype="bfloat16", - accum_dtype="float", + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 dkv_shape = [B, S_kv, kv_group, D + D_tail] @T.prim_func def postprocess_kernel( - dKV: T.Tensor(dkv_shape, accum_dtype), - dKV_out: T.Tensor(dkv_shape, dtype), + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), ): with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, B, threads=threads) as (bx, by, bz): T.copy( - dKV[bz, bx * block_N:(bx + 1) * block_N, by, :], - dKV_out[bz, bx * block_N:(bx + 1) * block_N, by, :], + dKV[bz, bx * block_N : (bx + 1) * block_N, by, :], + dKV_out[bz, bx * block_N : (bx + 1) * block_N, by, :], ) return postprocess_kernel @@ -82,7 +78,9 @@ def postprocess_kernel( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True, + }, +) def bwd( B, S, @@ -97,18 +95,18 @@ def bwd( block_size=32, num_stages=0, threads=256, - indices_dtype="int32", - dtype="bfloat16", - accum_dtype="float", + indices_dtype=T.int32, + dtype=T.bfloat16, + accum_dtype=T.float32, ): - assert is_causal == True, 'non-casual is not supported now' - assert topk % block_size == 0, 'otherwise will load some index=0 thus causing wrong kv to be loaded' - assert dtype == "bfloat16" - assert accum_dtype == "float" - assert indices_dtype == "int32" + assert is_causal == True, "non-casual is not supported now" + assert topk % block_size == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + assert indices_dtype == T.int32 if sm_scale is None: - sm_scale = (D + D_tail)**(-0.5) + sm_scale = (D + D_tail) ** (-0.5) sm_scale_mul_reciprocal_log2 = sm_scale * 1.44269504 # log2(e) H_kv = H // kv_group @@ -118,12 +116,15 @@ def bwd( indices_shape = [B, S, kv_group, topk] delta_shape = [B, S, H] lse_shape = [B, S, H] - assert indices_dtype == "int32" - assert dtype == "bfloat16" - assert accum_dtype == "float" + assert indices_dtype == T.int32 + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 H = H_kv padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) + block_H = 
min(64, padded_H) + assert padded_H % block_H == 0 + NH = padded_H // block_H BS = block_size NS = tilelang.cdiv(topk, block_size) @@ -131,122 +132,85 @@ def bwd( @T.prim_func def sparse_mla_bwd_kernel( - Q: T.Tensor(q_shape, dtype), - KV: T.Tensor(k_shape, dtype), - dO: T.Tensor(o_shape, dtype), - Indices: T.Tensor(indices_shape, indices_dtype), - Lse: T.Tensor(lse_shape, accum_dtype), - Delta: T.Tensor(delta_shape, accum_dtype), - dQ: T.Tensor(q_shape, dtype), - dKV: T.Tensor(k_shape, accum_dtype), + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), ): - with T.Kernel(S, B, kv_group, threads=threads) as (s_i, by, bz): - Q_shared = T.alloc_shared([padded_H, D], dtype) - Q_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + with T.Kernel(S, B, kv_group * NH, threads=threads) as (s_i, by, bz): + Q_shared = T.alloc_shared([block_H, D], dtype) + Q_tail_shared = T.alloc_shared([block_H, D_tail], dtype) KV_shared = T.alloc_shared([BS, D], dtype) KV_tail_shared = T.alloc_shared([BS, D_tail], dtype) - dO_shared = T.alloc_shared([padded_H, D], dtype) + dO_shared = T.alloc_shared([block_H, D], dtype) mask = T.alloc_fragment([BS], "bool") - P_shared_cast = T.alloc_shared([padded_H, BS], dtype) - dP_shared_cast = T.alloc_shared([padded_H, BS], dtype) - dQ_shared = T.alloc_shared([padded_H, D], dtype) - dQ_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + P_shared_cast = T.alloc_shared([block_H, BS], dtype) + dP_shared_cast = T.alloc_shared([block_H, BS], dtype) + dQ_shared = T.alloc_shared([block_H, D], dtype) + dQ_tail_shared = T.alloc_shared([block_H, D_tail], dtype) - acc_p = T.alloc_fragment([padded_H, BS], accum_dtype) - acc_dp = T.alloc_fragment([padded_H, BS], accum_dtype) - acc_dq = T.alloc_fragment([padded_H, D], accum_dtype) - acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) + acc_p = T.alloc_fragment([block_H, BS], accum_dtype) + acc_dp = T.alloc_fragment([block_H, BS], accum_dtype) + acc_dq = T.alloc_fragment([block_H, D], accum_dtype) + acc_dq_tail = T.alloc_fragment([block_H, D_tail], accum_dtype) acc_dkv = T.alloc_fragment([BS, D], accum_dtype) acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) - acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) - acc_dkv_tail_shared = T.view( - KV_tail_shared, shape=[BS // split_store, D_tail], dtype=accum_dtype) + acc_dkv_shared = T.alloc_shared([BS // split_store, D], accum_dtype) + acc_dkv_tail_shared = T.alloc_shared([BS // split_store, D_tail], accum_dtype) max_kv_i = s_i - T.copy(Q[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D], Q_shared) - T.copy(Q[by, s_i, bz * padded_H:(bz + 1) * padded_H, D:], Q_tail_shared) - T.copy(dO[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D], dO_shared) + T.copy(Q[by, s_i, bz * block_H : (bz + 1) * block_H, :D], Q_shared) + T.copy(Q[by, s_i, bz * block_H : (bz + 1) * block_H, D:], Q_tail_shared) + T.copy(dO[by, s_i, bz * block_H : (bz + 1) * block_H, :D], dO_shared) T.clear(acc_dq) T.clear(acc_dq_tail) - T.annotate_layout({ - dQ_shared: tilelang.layout.make_swizzled_layout(dQ_shared), - dQ_tail_shared: tilelang.layout.make_swizzled_layout(dQ_tail_shared), - }) - # Process each block of indices for i_i in T.Pipelined(NS, num_stages=num_stages): # Check which indices are valid for bi_i in 
T.Parallel(BS): - mask[bi_i] = Indices[by, s_i, bz, i_i * BS + bi_i] <= max_kv_i + mask[bi_i] = Indices[by, s_i, bz // NH, i_i * BS + bi_i] <= max_kv_i # Compute attention scores - for h_i, bi_i in T.Parallel(padded_H, BS): + for h_i, bi_i in T.Parallel(block_H, BS): acc_p[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_p.dtype)) # Load KV, V for this block of indices for bi_i, d_i in T.Parallel(BS, D): - KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, d_i] + KV_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz // NH, i_i * BS + bi_i], bz // NH, d_i] - T.gemm( - Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) for bi_i, d_i in T.Parallel(BS, D_tail): - KV_tail_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz, i_i * BS + bi_i], bz, - D + d_i] - T.gemm( - Q_tail_shared, - KV_tail_shared, - acc_p, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol) - - for h_i, bi_i in T.Parallel(padded_H, BS): - acc_p[h_i, bi_i] = T.exp2(acc_p[h_i, bi_i] * sm_scale_mul_reciprocal_log2 - - Lse[by, s_i, bz * padded_H + h_i]) + KV_tail_shared[bi_i, d_i] = KV[by, Indices[by, s_i, bz // NH, i_i * BS + bi_i], bz // NH, D + d_i] + T.gemm(Q_tail_shared, KV_tail_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for h_i, bi_i in T.Parallel(block_H, BS): + acc_p[h_i, bi_i] = T.exp2(acc_p[h_i, bi_i] * sm_scale_mul_reciprocal_log2 - Lse[by, s_i, bz * block_H + h_i]) T.copy(acc_p, P_shared_cast) - T.gemm( - dO_shared, - KV_shared, - acc_dp, - transpose_B=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) + T.gemm(dO_shared, KV_shared, acc_dp, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) - for h_i, bi_i in T.Parallel(padded_H, BS): - acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * ( - acc_dp[h_i, bi_i] - Delta[by, s_i, bz * padded_H + h_i]) * sm_scale + for h_i, bi_i in T.Parallel(block_H, BS): + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[by, s_i, bz * block_H + h_i]) * sm_scale T.copy(acc_dp, dP_shared_cast) T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) - T.gemm( - dP_shared_cast, - Q_shared, - acc_dkv, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol, - clear_accum=True) - T.gemm( - P_shared_cast, - dO_shared, - acc_dkv, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, Q_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(P_shared_cast, dO_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) T.clear(acc_dkv_tail) - T.gemm( - dP_shared_cast, - Q_tail_shared, - acc_dkv_tail, - transpose_A=True, - policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, Q_tail_shared, acc_dkv_tail, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) for s in range(split_store): for bi_i, d_i in T.Parallel(BS, D): @@ -255,41 +219,32 @@ def sparse_mla_bwd_kernel( for bi_i, d_i in T.Parallel(BS, D_tail): if bi_i < BS // split_store: - acc_dkv_tail_shared[bi_i, - d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), - d_i] + acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] for bi_i, d_i in T.Parallel(BS // split_store, D // 4): T.atomic_addx4( - dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], - bz, d_i * 4], acc_dkv_shared[bi_i, d_i * 4]) + dKV[by, Indices[by, s_i, bz 
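# [Illustration, not part of the diff] Dense reference for the per-row attention
# backward that the block loop above computes in the log2 domain (exp2 plus a
# precomputed Lse); here p is formed with an ordinary softmax instead:
import torch

def ref_attention_bwd_row(q, k, v, do, scale):
    s = (q @ k.T) * scale
    p = torch.softmax(s, dim=-1)
    dp = do @ v.T
    delta = (p * dp).sum(dim=-1, keepdim=True)   # equals rowsum(o * do) from preprocess
    ds = p * (dp - delta) * scale                # matches the acc_dp update above
    return ds @ k, ds.T @ q, p.T @ do            # dq, dk, dv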
// NH, i_i * BS + bi_i + s * (BS // split_store)], bz // NH, d_i * 4], + acc_dkv_shared[bi_i, d_i * 4], + ) # Atomically update dKV, dKV_tail tensors for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): T.atomic_addx4( - dKV[by, Indices[by, s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], - bz, D + d_i * 4], acc_dkv_tail_shared[bi_i, d_i * 4]) + dKV[by, Indices[by, s_i, bz // NH, i_i * BS + bi_i + s * (BS // split_store)], bz // NH, D + d_i * 4], + acc_dkv_tail_shared[bi_i, d_i * 4], + ) # Store the accumulated dQ T.copy(acc_dq, dQ_shared) T.copy(acc_dq_tail, dQ_tail_shared) - T.copy(dQ_shared, dQ[by, s_i, bz * padded_H:(bz + 1) * padded_H, :D]) - T.copy(dQ_tail_shared, dQ[by, s_i, bz * padded_H:(bz + 1) * padded_H, D:]) + T.copy(dQ_shared, dQ[by, s_i, bz * block_H : (bz + 1) * block_H, :D]) + T.copy(dQ_tail_shared, dQ[by, s_i, bz * block_H : (bz + 1) * block_H, D:]) return sparse_mla_bwd_kernel -def sparse_mla_bwd(q, - kv, - o, - do, - indices, - lse, - sm_scale=None, - is_casual=True, - return_kernel=False, - delta=None): +def sparse_mla_bwd(q, kv, o, do, indices, lse, sm_scale=None, is_casual=True, return_kernel=False, delta=None): assert q.is_contiguous() assert kv.is_contiguous() assert indices.is_contiguous() @@ -322,6 +277,7 @@ def sparse_mla_bwd(q, def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_casual=True): from sparse_mla_fwd import ref_sparse_mla_fwd_interface + q = q.detach().clone() kv = kv.detach().clone() q.requires_grad = True @@ -331,30 +287,22 @@ def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, sm_scale=None, is_c return q.grad, kv.grad -def test_sparse_mla_bwd(B=1, - S=4096, - SKV=8192, - H=64, - HKV=1, - DQKV=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True): +def test_sparse_mla_bwd(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16, check_correctness=True): # Prepare data - q = torch.randn((B, S, H, DQKV), dtype=dtype, device='cuda').requires_grad_(True) - kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device='cuda').requires_grad_(True) - do = torch.randn((B, S, H, DV), dtype=dtype, device='cuda') + q = torch.randn((B, S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, S, H, DV), dtype=dtype, device="cuda") - indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device='cuda') + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") for b in range(B): for t in range(S): for h in range(HKV): i_i = torch.randperm(max(1, t))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i # Forward from sparse_mla_fwd import sparse_mla_fwd_interface + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices) tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse) @@ -365,13 +313,15 @@ def test_sparse_mla_bwd(B=1, assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") print("assert_tensors_similar passed") - per_token_flop = 2 * sum([ - H * DV * topk, - H * DQKV * topk, - H * DQKV * topk, - H * DQKV * topk, - H * DV * topk, - ]) + per_token_flop = 2 * sum( + [ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ] + ) from tilelang.profiler import do_bench def fn(): @@ -379,20 +329,44 @@ def fn(): ms = do_bench(fn, rep=100, warmup=250) print(f"Average time: {ms:.3f} ms") - print(f'bwd io bandwidth = ', - (B * S * max(DQKV * 
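# [Illustration, not part of the diff] The atomic_addx4 scatter above accumulates
# per-block dKV contributions into an fp32 buffer at the gathered kv positions;
# on the host the same accumulation (duplicates included) can be written with
# index_add_, with the postprocess kernel handling the final cast to bf16:
import torch

def scatter_dkv_row(dkv_fp32, indices, contrib, b, g):
    # dkv_fp32: [B, S_kv, G, D_total] float32 accumulator
    # indices:  [topk] int32 kv positions selected for one query row
    # contrib:  [topk, D_total] per-index gradient contributions for that row
    dkv_fp32[b, :, g, :].index_add_(0, indices.long(), contrib.float())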
2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) - print(f'bwd tflops = ', per_token_flop * S / (ms * 1e-3) / 1e12) + print(f"bwd io bandwidth = ", (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f"bwd tflops = ", per_token_flop * S / (ms * 1e-3) / 1e12) + + +def run_regression_perf(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) + q = torch.randn((B, S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((B, S, H, DV), dtype=dtype, device="cuda") + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + from sparse_mla_fwd import sparse_mla_fwd_interface + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices) + B, S, H, dim_plus_tail_dim = q.shape + _, S_kv, kv_group, _ = kv.shape + D = 512 + D_tail = dim_plus_tail_dim - D + topk = indices.shape[-1] + preprocess_kernel = preprocess(B, S, H, D) + bwd_kernel = bwd(B, S, S_kv, H, D, D_tail, topk, kv_group, None, True) + delta = preprocess_kernel(tl_out, do) + dkv = torch.zeros_like(kv, dtype=torch.float32) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + return bwd_kernel(q, kv, do, indices, tl_lse, delta, dkv) + + return do_bench(run_kernel_only, backend="cupti") if __name__ == "__main__": - test_sparse_mla_bwd( - B=1, - S=4096, - SKV=8192, - H=64, - HKV=1, - DQKV=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True) + test_sparse_mla_bwd(B=1, S=4096, SKV=8192, H=64, HKV=1, DQKV=576, DV=512, topk=2048, dtype=torch.bfloat16, check_correctness=True) diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py index a39c72c40..2c8bf7fc7 100644 --- a/examples/deepseek_v32/sparse_mla_fwd.py +++ b/examples/deepseek_v32/sparse_mla_fwd.py @@ -25,15 +25,12 @@ def sparse_mla_fwd( num_stages=2, threads=256, ): - assert dim == tilelang.math.next_power_of_2( - dim), f"haven't check padding correctness yet, dim={dim}" - assert tail_dim == tilelang.math.next_power_of_2( - tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" assert is_causal == True, "non-casual is not supported" - assert (topk % - block_I == 0), "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" if sm_scale is None: - sm_scale = (1.0 / (dim + tail_dim))**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) else: sm_scale = sm_scale * 1.44269504 # log2(e) @@ -47,17 +44,17 @@ def sparse_mla_fwd( o_shape = [batch, seq_len, heads, dim] indices_shape = [batch, seq_len, kv_group, topk] lse_shape = [batch, seq_len, heads] - indices_dtype = "int32" - dtype = "bfloat16" - accum_dtype = "float" + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 G = kv_group H = head_kv padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) if padded_H != H: - assert ( - kv_group == 1 - ), "here we solve the H 
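# [Illustration, not part of the diff] The 1.44269504 factor folded into sm_scale
# above is log2(e); pre-scaling the scores by it lets the kernels use exp2 in
# place of exp:
import math
import torch

x = torch.randn(8)
scale = (1.0 / (512 + 64)) ** 0.5          # same form as the default sm_scale
assert torch.allclose(torch.exp(x * scale),
                      torch.exp2(x * scale * 1.44269504), atol=1e-6)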
padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) BI = block_I NI = tilelang.cdiv(topk, block_I) D = dim @@ -73,18 +70,17 @@ def sparse_mla_fwd( @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), # type: ignore - KV: T.Tensor(kv_shape, dtype), # type: ignore - Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore - Output: T.Tensor(o_shape, dtype), # type: ignore - Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore ): - with T.Kernel( - seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( - bx, - by, - bz, - ): + with T.Kernel(seq_len * REPLICATE_H, batch, kv_group, threads=threads) as ( + bx, + by, + bz, + ): Q_shared = T.alloc_shared([H_per_block, D], dtype) Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) KV_shared = T.alloc_shared([BI, D], dtype) @@ -118,16 +114,13 @@ def main( T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) for i_i in T.Pipelined(NI, num_stages=num_stages): - for bi_i in T.Parallel(BI): mask[bi_i] = Indices[b_i, s_i, g_i, i_i * BI + bi_i] <= max_kv_i for bi_i, d_i in T.Parallel(BI, D): - KV_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, - d_i] + KV_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, d_i] for bi_i, d_i in T.Parallel(BI, D_tail): - K_tail_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, - D + d_i] + K_tail_shared[bi_i, d_i] = KV[b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] for h_i, bi_i in T.Parallel(H_per_block, BI): acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) @@ -147,6 +140,8 @@ def main( ) T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -174,15 +169,7 @@ def main( return main -def sparse_mla_fwd_interface(q, - kv, - indices, - sm_scale=None, - return_p_sum: bool = False, - d_v=512, - block_I=64, - num_stages=2, - threads=256): +def sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, return_p_sum: bool = False, d_v=512, block_I=64, num_stages=2, threads=256): is_casual = True assert return_p_sum == False, "This kernel file is for fwd only" assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() @@ -199,16 +186,8 @@ def sparse_mla_fwd_interface(q, assert indices.shape == (batch, seq_len, kv_group, topk) kernel = sparse_mla_fwd( - heads, - dim, - tail_dim, - topk, - kv_group, - sm_scale, - is_casual, - block_I=block_I, - num_stages=num_stages, - threads=threads) + heads, dim, tail_dim, topk, kv_group, sm_scale, is_casual, block_I=block_I, num_stages=num_stages, threads=threads + ) out, lse = kernel(q, kv, indices) return out, lse @@ -228,14 +207,14 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, 
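# [Illustration, not part of the diff] The added T.max(m_i, m_i_prev) lines keep
# the running row maximum monotone, which is the invariant the online-softmax
# recurrence below relies on (the rescaling factor stays <= 1):
import math

def online_softmax(blocks, m_init=-2.0 ** 30):
    # blocks: iterable of lists of already-scaled scores for one query row;
    # m_init mirrors the kernel's finite -2**30 fill that avoids inf - inf.
    m, l = m_init, 0.0
    for block in blocks:
        m_new = max(m, max(block))               # running max may only grow
        l = l * math.exp(m - m_new) + sum(math.exp(s - m_new) for s in block)
        m = m_new
    return m, l                                  # lse = m + log(l)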
is_casual=True): b, _, _, dim_v = v.shape g_index = g h_index = h // g - compressed_casual_mask = torch.arange( - 0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( - 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda").view(1, -1) + compressed_casual_mask = torch.arange(0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda" + ).view(1, -1) mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) mask = mask[..., :-1] mask = mask & compressed_casual_mask.view(1, 1, sq, sk) - mask[:, :, :1 - 1, 0] = True + mask[:, :, : 1 - 1, 0] = True mask = mask.view(b, g_index, 1, sq, sk) q = q.view(b, sq, g, -1, dim_q) @@ -250,19 +229,21 @@ def ref_sparse_mla_fwd_interface(q, kv, indices, sm_scale=None, is_casual=True): return o.to(torch.bfloat16) -def test_sparse_mla_fwd(B=1, - S=4096, - SKV=8192, - H=128, - HKV=1, - DQK=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - check_correctness=True, - block_I=64, - num_stages=2, - threads=256): +def test_sparse_mla_fwd( + B=1, + S=4096, + SKV=8192, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, +): torch.random.manual_seed(0) q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) @@ -272,10 +253,9 @@ def test_sparse_mla_fwd(B=1, for t in range(S): for h in range(HKV): i_i = torch.randperm(max(1, t))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i - tl_out, tl_lse = sparse_mla_fwd_interface( - q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) if check_correctness: # otherwise may cause out of memory @@ -284,8 +264,7 @@ def test_sparse_mla_fwd(B=1, print("assert_tensors_similar passed") def fn(): - return sparse_mla_fwd_interface( - q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) + return sparse_mla_fwd_interface(q, kv, indices, block_I=block_I, num_stages=num_stages, threads=threads) from tilelang.profiler import do_bench @@ -299,6 +278,36 @@ def fn(): print("fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) +def run_regression_perf( + B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, block_I=64, num_stages=2, threads=256 +): + torch.random.manual_seed(0) + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + is_casual = True + _, _, heads, dim_plus_tail_dim = q.shape + _, _, kv_group, _ = kv.shape + dim = 512 + tail_dim = dim_plus_tail_dim - dim + _, _, _, topk = indices.shape + kernel = sparse_mla_fwd(heads, dim, tail_dim, topk, kv_group, None, is_casual, block_I=block_I, num_stages=num_stages, threads=threads) + + def run_kernel_only(): + kernel(q, kv, indices) + + from tilelang.profiler import do_bench + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": test_sparse_mla_fwd( B=1, @@ -313,4 +322,5 @@ 
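# [Illustration, not part of the diff] The reference above densifies the
# [B, S, G, topk] index tensor by scattering into an extra "trash" column
# (padding entries point at index SKV) and then dropping that column:
import torch

B, S, G, SKV, topk = 1, 4, 1, 8, 3
indices = torch.full((B, S, G, topk), SKV)         # SKV marks unused topk slots
indices[0, 2, 0, :2] = torch.tensor([1, 5])        # query row 2 selects kv 1 and 5
mask = torch.zeros(B, G, S, SKV + 1, dtype=torch.bool)
mask = mask.scatter(3, indices.transpose(1, 2).long(), 1)[..., :-1]
assert mask[0, 0, 2].nonzero().flatten().tolist() == [1, 5]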
def fn(): check_correctness=True, block_I=64, num_stages=2, - threads=256) + threads=256, + ) diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py index 96dda7df5..7e664d11b 100644 --- a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py +++ b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py @@ -9,10 +9,16 @@ @tilelang.jit( out_idx=[-2, -1], compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) def sparse_mla_fwd( @@ -32,14 +38,12 @@ def sparse_mla_fwd( num_stages=0, threads=384, ): - assert dim == tilelang.math.next_power_of_2( - dim), f"haven't check padding correctness yet, dim={dim}" - assert tail_dim == tilelang.math.next_power_of_2( - tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" - assert is_causal == True, 'non-casual is not supported' - assert topk % block_I == 0, 'otherwise will load some index=0 thus causing wrong kv to be loaded' + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" if sm_scale is None: - sm_scale = (1.0 / (dim + tail_dim))**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) else: sm_scale = sm_scale * 1.44269504 # log2(e) @@ -49,23 +53,25 @@ def sparse_mla_fwd( o_shape = [batch, seq_len, heads, dim] indices_shape = [batch, seq_len, kv_group, topk] lse_shape = [batch, seq_len, heads] - indices_dtype = "int32" - dtype = "bfloat16" - accum_dtype = "float" + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 G = kv_group H = head_kv padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) if padded_H != H: - assert kv_group == 1, 'here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)' + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) BI = block_I NI = tilelang.cdiv(topk, block_I) - assert NI % 2 == 0, 'NI should be a multiple of 2' + assert NI % 2 == 0, "NI should be a multiple of 2" D = dim D_tail = tail_dim KV_stride = kv_stride if head_kv > 64: - assert head_kv % 64 == 0, 'head_kv should be a multiple of 64' + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" REPLICATE_H = head_kv // 64 else: REPLICATE_H = 1 @@ -74,18 +80,14 @@ def sparse_mla_fwd( @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), # type: ignore - KV: T.Tensor(kv_shape, dtype), # type: ignore - Indices: 
T.Tensor(indices_shape, indices_dtype), # type: ignore - q_start_index_s: T.Tensor(1, indices_dtype), - Output: T.Tensor(o_shape, dtype), # type: ignore - Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + q_start_index_s: T.Tensor(1, indices_dtype), + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore ): - with T.Kernel( - (seq_len - kv_stride + 1 if CP0 else seq_len) * REPLICATE_H, - batch, - kv_group, - threads=threads) as (bx, by, bz): + with T.Kernel((seq_len - kv_stride + 1 if CP0 else seq_len) * REPLICATE_H, batch, kv_group, threads=threads) as (bx, by, bz): Q_shared_l = T.alloc_shared([H_per_block, D // 2], dtype) Q_shared_r = T.alloc_shared([H_per_block, D // 2], dtype) Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) @@ -110,7 +112,7 @@ def main( alpha_local = T.alloc_fragment([H_per_block], accum_dtype) m_i = T.alloc_fragment([H_per_block], accum_dtype) m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) - indices_local = T.alloc_local([1], indices_dtype) + indices_local = T.alloc_var(indices_dtype) # TODO: Multi buffer bar_q = T.alloc_barrier(arrive_count=384) @@ -122,8 +124,7 @@ def main( bar_sScale_and_sS_free = T.alloc_barrier(arrive_count=256) b_i, g_i = by, bz - s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else ( - bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) + s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else (bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) q_i = q_start_index_s[0] + s_i max_kv_i = (q_i + 1 - KV_stride) // KV_stride @@ -132,26 +133,24 @@ def main( tx = T.get_thread_binding() - T.copy(Q[b_i, s_i, H0:H1, 0:D // 2], Q_shared_l) - T.copy(Q[b_i, s_i, H0:H1, D // 2:D], Q_shared_r) + T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l) + T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r) T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) T.barrier_arrive(bar_q) if tx < 128: T.set_max_nreg(240, 1) T.fill(sumexp, 0) - T.fill(m_i, -2**30) # avoid -inf - inf to cause nan + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan T.fill(acc_o_l, 0) T.barrier_wait(bar_q, 0) for i_i in T.serial(T.ceildiv(NI, 2)): - # Buffer 0 T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) for h_i, bi_i in T.Parallel(H_per_block, BI): - acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, - -T.infinity(acc_s.dtype)) + acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True, wg_wait=-1) @@ -164,6 +163,8 @@ def main( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -185,8 +186,7 @@ def main( T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) for h_i, bi_i in T.Parallel(H_per_block, BI): - acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, - -T.infinity(acc_s.dtype)) + acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) T.gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_shared_r, KV_shared_1_r, 
acc_s, transpose_B=True, wg_wait=-1) T.gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True, wg_wait=-1) @@ -198,6 +198,8 @@ def main( T.copy(m_i, m_i_prev) T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + m_i[h_i] = T.max(m_i[h_i], m_i_prev[h_i]) for h_i in T.Parallel(H_per_block): alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale) for h_i, bi_i in T.Parallel(H_per_block, BI): @@ -223,7 +225,7 @@ def main( for h_i in T.Parallel(H_per_block): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0:D // 2]) + T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0 : D // 2]) elif tx >= 128 and tx < 256: T.set_max_nreg(168, 1) @@ -253,7 +255,7 @@ def main( acc_o_r[h_i, d_i] /= sum_exp_shared[h_i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2:D]) + T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2 : D]) elif tx >= 256: # producer T.set_max_nreg(80, 0) @@ -261,70 +263,58 @@ def main( # Buffer 0 T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - indices_local[0] = Indices[b_i, s_i, g_i, - (i_i * 2) * BI + r * 16 + (tx - 256) // 8] - is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local[0] <= max_kv_i + indices_local = Indices[b_i, s_i, g_i, (i_i * 2) * BI + r * 16 + (tx - 256) // 8] + is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_0_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, D // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - D + (tx - 256) % 8 * 8 + v] + K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D + (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): - indices_local[0] = Indices[b_i, s_i, g_i, - (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] - is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local[0] <= max_kv_i + indices_local = Indices[b_i, s_i, g_i, (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] + is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: with T.attr("default", "async_scope", 1): for u in T.serial(4): for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - 64 * u + (tx - 256) % 8 * 8 + v] - KV_shared_1_r[r * 16 + (tx - 256) // 8, - 64 * u + (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, D // 2 + - 64 * u + (tx - 256) % 8 * 8 + v] + KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8 + v + ] + KV_shared_1_r[r * 16 
+ (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v + ] with T.attr("default", "async_scope", 1): for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + - v] = KV[b_i, indices_local[0], g_i, - D + (tx - 256) % 8 * 8 + v] + K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ + b_i, indices_local, g_i, D + (tx - 256) % 8 * 8 + v + ] T.cp_async_barrier_noinc(bar_k_1_ready[0]) return main -def sparse_mla_fwd_interface(q, - kv, - indices, - q_start_index_s, - kv_stride, - sm_scale=None, - is_casual=True, - return_kernel=False, - print_kernel=False): +def sparse_mla_fwd_interface( + q, kv, indices, q_start_index_s, kv_stride, sm_scale=None, is_casual=True, return_kernel=False, print_kernel=False +): assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() batch, seq_len, heads, dim_plus_tail_dim = q.shape _, seq_len_kv, kv_group, _ = kv.shape - assert dim_plus_tail_dim == 576, 'you should assign dim otherwise' + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" dim = 512 assert kv.shape[-1] == dim_plus_tail_dim @@ -334,29 +324,23 @@ def sparse_mla_fwd_interface(q, assert indices.shape == (batch, seq_len, kv_group, topk) if q_start_index_s != 0: - assert q_start_index_s > kv_stride, "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + assert q_start_index_s > kv_stride, ( + "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + ) CP0 = q_start_index_s == 0 - kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, - kv_group, sm_scale, is_casual, CP0) + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, kv_group, sm_scale, is_casual, CP0) if print_kernel: print(kernel.get_kernel_source()) - out, lse = kernel(q, kv, indices, - torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) + out, lse = kernel(q, kv, indices, torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) if return_kernel: return kernel if q_start_index_s == 0 and kv_stride > 1: - out[:, :kv_stride - 1, :, :] = 0 + out[:, : kv_stride - 1, :, :] = 0 return out, lse -def ref_sparse_mla_fwd_interface(q, - kv, - indices, - q_start_index_s, - kv_stride=4, - sm_scale=None, - is_casual=True): +def ref_sparse_mla_fwd_interface(q, kv, indices, q_start_index_s, kv_stride=4, sm_scale=None, is_casual=True): q = q.float() kv = kv.float() indices = indices.transpose(1, 2) @@ -365,7 +349,7 @@ def ref_sparse_mla_fwd_interface(q, if q_start_index_s is None: q_start_index_s = sk * kv_stride - sq - assert kv.shape[-1] == 576, 'you should assign dim otherwise' + assert kv.shape[-1] == 576, "you should assign dim otherwise" dim = 512 k = kv v = kv[..., :dim] @@ -374,15 +358,14 @@ def ref_sparse_mla_fwd_interface(q, num_kv_per_index = 1 g_index = g h_index = h // g - compressed_casual_mask = torch.arange( - q_start_index_s, sq + q_start_index_s, dtype=torch.int32, - device="cuda").view(-1, 1) >= 
torch.arange( - kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) + compressed_casual_mask = torch.arange(q_start_index_s, sq + q_start_index_s, dtype=torch.int32, device="cuda").view( + -1, 1 + ) >= torch.arange(kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) mask = mask[..., :-1] mask = mask & compressed_casual_mask.view(1, 1, sq, sk) - mask[:, :, :kv_stride - 1, 0] = True + mask[:, :, : kv_stride - 1, 0] = True mask = mask.view(b, g_index, 1, sq, sk) q = q.view(b, sq, g, -1, dim_q) @@ -397,41 +380,32 @@ def ref_sparse_mla_fwd_interface(q, return o.to(torch.bfloat16) -def test_sparse_mla_fwd_pipelined(B=1, - S=4096, - SKV=8192, - H=128, - HKV=1, - DQK=576, - DV=512, - topk=2048, - dtype=torch.bfloat16, - q_start_s_index=1024, - check_correctness=True): +def test_sparse_mla_fwd_pipelined( + B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, q_start_s_index=1024, check_correctness=True +): KV_stride = 1 torch.random.manual_seed(0) - q = torch.randn((B, S, H, DQK), dtype=dtype, device='cuda').requires_grad_(True) / 10 - kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device='cuda').requires_grad_(True) / 10 + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 q_start_s_index_t = torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda") q.clamp_(-10, 10) kv.clamp_(-10, 10) - indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device='cuda') + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") for b in range(B): for t in range(S): for h in range(HKV): i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] - indices[b, t, h, :len(i_i)] = i_i + indices[b, t, h, : len(i_i)] = i_i - kernel = sparse_mla_fwd_interface( - q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) + kernel = sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) def fn(): out, lse = kernel(q, kv, indices, q_start_s_index_t) if q_start_s_index == 0 and KV_stride > 1: - out[:, :KV_stride - 1, :, :] = 0 + out[:, : KV_stride - 1, :, :] = 0 return out, lse tl_out, tl_lse = fn() @@ -442,14 +416,46 @@ def fn(): torch.testing.assert_close(tl_out, ref_out, rtol=1e-3, atol=1e-3) from tilelang.profiler import do_bench + ms = do_bench( fn, rep=10, warmup=10, ) print(f"Average time: {ms:.3f} ms") - print(f'fwd io bandwidth = ', (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) - print(f'fwd tflops = ', (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + print(f"fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print(f"fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + + +def run_regression_perf(B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, topk=2048, dtype=torch.bfloat16, q_start_s_index=1024): + KV_stride = 1 + + torch.random.manual_seed(0) + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + q.clamp_(-10, 10) + kv.clamp_(-10, 10) + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in 
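# [Illustration, not part of the diff] The strided causal condition rebuilt in
# the reference above lets a query at absolute position q attend to compressed
# kv slot j only once the whole stride-sized block is in the past, i.e.
# q >= (j + 1) * kv_stride - 1:
import torch

def compressed_causal_mask(sq, sk, q_start, kv_stride):
    q_pos = torch.arange(q_start, q_start + sq).view(-1, 1)
    kv_last = torch.arange(kv_stride - 1, sk * kv_stride, kv_stride).view(1, -1)
    return q_pos >= kv_last                      # [sq, sk] boolean mask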
range(HKV): + i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + batch, seq_len, heads, dim_plus_tail_dim = q.shape + _, seq_len_kv, kv_group, _ = kv.shape + dim = 512 + tail_dim = dim_plus_tail_dim - dim + CP0 = q_start_s_index == 0 + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, KV_stride, kv_group, None, True, CP0) + + def run_kernel_only(): + kernel(q, kv, indices, torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda")) + + from tilelang.profiler import do_bench + + return do_bench(run_kernel_only, backend="cupti") if __name__ == "__main__": @@ -460,5 +466,4 @@ def fn(): B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 else: B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 - test_sparse_mla_fwd_pipelined( - B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness) + test_sparse_mla_fwd_pipelined(B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=args.test_correctness) diff --git a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py index 971a3206c..983798f9f 100644 --- a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py +++ b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py @@ -1,42 +1,43 @@ # ruff: noqa +import tilelang import tilelang.testing -from topk_selector import test_topk_selector -from fp8_lighting_indexer import test_fp8_lighting_indexer -from sparse_mla_fwd import test_sparse_mla_fwd -from sparse_mla_fwd_pipelined import test_sparse_mla_fwd_pipelined -from sparse_mla_bwd import test_sparse_mla_bwd +import topk_selector +import fp8_lighting_indexer +import sparse_mla_fwd +import sparse_mla_fwd_pipelined +import sparse_mla_bwd def test_example_topk_selector(): - test_topk_selector() + topk_selector.test_topk_selector() def test_example_fp8_lighting_indexer(): - test_fp8_lighting_indexer(S=1024, SKV=2048, H=32, HKV=1, D=64, kv_stride=1) + fp8_lighting_indexer.test_fp8_lighting_indexer(S=512, SKV=1024, H=32, HKV=1, D=64, kv_stride=1) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_fwd(): # small shapes for testing - test_sparse_mla_fwd( - S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) + sparse_mla_fwd.test_sparse_mla_fwd(S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_fwd_pipelined(): # small shapes for testing - test_sparse_mla_fwd_pipelined( - S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) + sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined(S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_sparse_mla_bwd(): - test_sparse_mla_bwd( - S=256, SKV=1024, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) + sparse_mla_bwd.test_sparse_mla_bwd(S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) + sparse_mla_bwd.test_sparse_mla_bwd( + S=256, SKV=512, H=128, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False + ) # test for large H if __name__ == "__main__": diff --git 
a/examples/deepseek_v32/topk_selector.py b/examples/deepseek_v32/topk_selector.py index 4a4b43277..078eb2686 100644 --- a/examples/deepseek_v32/topk_selector.py +++ b/examples/deepseek_v32/topk_selector.py @@ -8,24 +8,24 @@ def convert_to_uint16(x): - hval = T.Cast("float16", x) - bits_uint = T.reinterpret("uint16", hval) + hval = T.Cast(T.float16, x) + bits_uint = T.reinterpret(T.uint16, hval) bits_uint = T.if_then_else(x < 0, ~bits_uint & (0xFFFF), bits_uint | (0x8000)) return bits_uint >> 8 def convert_to_uint32(x): - bits_uint = T.reinterpret("uint32", x) + bits_uint = T.reinterpret(T.uint32, x) bits_uint = T.if_then_else( x < 0, - ~bits_uint & T.Cast("uint32", (0xFFFFFFFF)), - bits_uint | T.Cast("uint32", (0x80000000)), + ~bits_uint & T.Cast(T.uint32, (0xFFFFFFFF)), + bits_uint | T.Cast(T.uint32, (0x80000000)), ) return bits_uint @tilelang.jit(pass_configs=pass_configs) -def tl_topk_impl(topk, in_dtype="float32", out_dtype="int32"): +def tl_topk_impl(topk, in_dtype=T.float32, out_dtype=T.int32): batch = T.dynamic("batch") seq_len = T.dynamic("seq_len") RADIX = 1 << 8 @@ -42,20 +42,20 @@ def tl_topk_kernel( with T.Kernel(batch, threads=BLOCK_SIZE) as (bx): tx = T.get_thread_binding() - s_threshold_bin_id = T.alloc_shared([1], "int32") - s_histogram = T.alloc_shared([RADIX + 1], "int32") - s_num_input = T.alloc_shared([2], "int32") - s_input_idx = T.alloc_shared([2, SMEM_INPUT_SIZE], "int32") - - l_threshold_bin_id = T.alloc_var("int32") - l_new_topk = T.alloc_var("int32") - l_num_input = T.alloc_var("int32") - l_bin_id32 = T.alloc_var("int32") - l_val = T.alloc_var("int32") - l_start_pos = T.alloc_var("int32") - l_start_idx = T.alloc_var("int32") - l_end_idx = T.alloc_var("int32") - l_out_pos = T.alloc_var("int32") + s_threshold_bin_id = T.alloc_shared([1], T.int32) + s_histogram = T.alloc_shared([RADIX + 1], T.int32) + s_num_input = T.alloc_shared([2], T.int32) + s_input_idx = T.alloc_shared([2, SMEM_INPUT_SIZE], T.int32) + + l_threshold_bin_id = T.alloc_var(T.int32) + l_new_topk = T.alloc_var(T.int32) + l_num_input = T.alloc_var(T.int32) + l_bin_id32 = T.alloc_var(T.int32) + l_val = T.alloc_var(T.int32) + l_start_pos = T.alloc_var(T.int32) + l_start_idx = T.alloc_var(T.int32) + l_end_idx = T.alloc_var(T.int32) + l_out_pos = T.alloc_var(T.int32) l_new_topk = topk l_start_idx = starts[bx] @@ -99,7 +99,7 @@ def tl_topk_kernel( input_idx = s * BLOCK_SIZE + tx if input_idx < l_end_idx and input_idx >= l_start_idx and input_idx < seq_len: bin_id = convert_to_uint16(input[bx, input_idx]) - l_bin_id32 = T.Cast("int32", bin_id) + l_bin_id32 = T.Cast(T.int32, bin_id) if l_bin_id32 > l_threshold_bin_id: # need a pos = T.atomic_add(s_histogram[bin_id32+1], 1) pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) @@ -127,9 +127,9 @@ def tl_topk_kernel( l_num_input = s_num_input[r_idx] for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast("int32", (( - convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> - (24 - round * 8)) & 0xFF)) + l_bin_id32 = T.Cast( + T.int32, ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF) + ) T.atomic_add(s_histogram[l_bin_id32], 1) T.sync_threads() # cumsum @@ -156,23 +156,20 @@ def tl_topk_kernel( for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): T.sync_threads() if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast("int32", (( - convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> - (24 - 
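# [Illustration, not part of the diff] convert_to_uint16 / convert_to_uint32
# above apply the usual order-preserving float-to-unsigned-key mapping used by
# radix select: flip every bit of negative values and only the sign bit of
# non-negative ones, so unsigned comparison of keys matches float comparison.
import struct

def float_to_ordered_u32(x: float) -> int:
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    return (~bits & 0xFFFFFFFF) if x < 0 else (bits | 0x80000000)

vals = [-3.5, -0.25, 0.0, 1.0, 7.5]
keys = [float_to_ordered_u32(v) for v in vals]
assert keys == sorted(keys)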
round * 8)) & 0xFF)) + l_bin_id32 = T.Cast( + T.int32, ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF) + ) if l_bin_id32 > l_threshold_bin_id: - pos = T.atomic_add( - s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos + pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos index[bx, pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] elif l_bin_id32 == l_threshold_bin_id and l_new_topk > 0: if round == 3: - l_out_pos = T.atomic_add( - s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos + l_out_pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos if l_out_pos < topk: index[bx, l_out_pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] else: pos = T.atomic_add(s_num_input[r_idx ^ 1], 1, return_prev=True) - s_input_idx[r_idx ^ 1, pos] = s_input_idx[r_idx, - s * BLOCK_SIZE + tx] + s_input_idx[r_idx ^ 1, pos] = s_input_idx[r_idx, s * BLOCK_SIZE + tx] return tl_topk_kernel @@ -186,7 +183,6 @@ def tl_topk(input, starts, ends, topk): def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): - batch = 64 seq_len = 32 * 1024 topk = 2048 @@ -212,8 +208,7 @@ def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): set_ref = set(ref_np) set_trt = set(trt_np) intersection = set_ref & set_trt - print("selected/all:", len(intersection), "/", len(set_ref), "=", - len(intersection) / len(set_ref)) + print("selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) # Performance test with CUDA events @@ -245,5 +240,35 @@ def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): print(f"Average torch.topk time: {elapsed_time_ms / n_iters:.3f} ms") +def run_regression_perf(batch=64, seq_len=32 * 1024, topk=2048): + batch = 64 + seq_len = 32 * 1024 + topk = 2048 + torch.manual_seed(1) + input = torch.randn(batch, seq_len, dtype=torch.float32).cuda() + starts = torch.zeros(batch, dtype=torch.int32).cuda() + ends = torch.ones(batch, dtype=torch.int32).cuda() * seq_len + + indexes = tl_topk(input, starts, ends, topk) + + indexes_ref = torch.topk(input, topk, dim=-1)[1] + + for i in range(batch): + ref_np = indexes_ref[i].cpu().to(torch.int32).numpy() + trt_np = indexes[i].cpu().to(torch.int32).numpy() + + set_ref = set(ref_np) + set_trt = set(trt_np) + intersection = set_ref & set_trt + print("selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + tl_topk(input, starts, ends, topk) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": test_topk_selector() diff --git a/examples/deepseek_v32/utils.py b/examples/deepseek_v32/utils.py index 2ea34b14a..d7252e171 100644 --- a/examples/deepseek_v32/utils.py +++ b/examples/deepseek_v32/utils.py @@ -23,8 +23,7 @@ def _is_equal(a, b): if isinstance(a, torch.Tensor): return a is b # Whitelist of types that are safe to compare by value for caching. - if isinstance(a, (int, float, str, bool, type(None))) and isinstance( - b, (int, float, str, bool, type(None))): + if isinstance(a, (int, float, str, bool, type(None))) and isinstance(b, (int, float, str, bool, type(None))): return a == b # For other types, we cannot guarantee a cheap and safe comparison, so we fail the cache check. 
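# [Sketch, not part of the diff] The _is_equal / wrapper pair above is a
# one-entry memo that compares tensor arguments by identity and plain scalars by
# value; a stripped-down, positional-args-only version of the same idea:
import torch
from functools import wraps

def last_call_cache(fn):
    state = {"args": None, "out": None}

    @wraps(fn)
    def wrapper(*args):
        prev = state["args"]
        hit = prev is not None and len(prev) == len(args) and all(
            (a is b) if isinstance(a, torch.Tensor) else (a == b)
            for a, b in zip(args, prev))
        if not hit:
            state["args"], state["out"] = args, fn(*args)
        return state["out"]

    return wrapper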
return False @@ -58,9 +57,11 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: if len(args) == len(last_args) and len(kwargs) == len(last_kwargs): # For Tensors, check for object identity. For other types, check for equality. # Python caches small integers, so `is` works for them but not for large integers like 4096. - if all(_is_equal(a, b) for a, b in zip(args, last_args)) and \ - set(kwargs.keys()) == set(last_kwargs.keys()) and \ - all(_is_equal(v, last_kwargs[k]) for k, v in kwargs.items()): + if ( + all(_is_equal(a, b) for a, b in zip(args, last_args)) + and set(kwargs.keys()) == set(last_kwargs.keys()) + and all(_is_equal(v, last_kwargs[k]) for k, v in kwargs.items()) + ): return last_result result = fn(*args, **kwargs) @@ -79,73 +80,68 @@ def cal_seq_idx_from_cu_seqlens(cu_seqlens: torch.LongTensor, seq_len: int): @tensor_cache -def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - seq_len: int) -> torch.IntTensor: - seq_idx_for_q = torch.full((seq_len,), - len(cu_seqlens_qs), - dtype=torch.int32, - device=cu_seqlens_qs.device) +def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, seq_len: int) -> torch.IntTensor: + seq_idx_for_q = torch.full((seq_len,), len(cu_seqlens_qs), dtype=torch.int32, device=cu_seqlens_qs.device) for i in range(len(cu_seqlens_qs)): - seq_idx_for_q[cu_seqlens_qs[i]:cu_seqlens_qe[i]] = i + seq_idx_for_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = i return seq_idx_for_q @tensor_cache -def cal_cu_seqlen_ks_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - cu_seqlens_ks: torch.LongTensor, seq_len: int) -> torch.IntTensor: +def cal_cu_seqlen_ks_for_q( + cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, cu_seqlens_ks: torch.LongTensor, seq_len: int +) -> torch.IntTensor: cu_seqlen_ks_for_each_q = torch.gather( - input=torch.cat([ - cu_seqlens_ks, - torch.full((1,), - torch.iinfo(torch.int32).max, - dtype=torch.int32, - device=cu_seqlens_qs.device) - ]), + input=torch.cat([cu_seqlens_ks, torch.full((1,), torch.iinfo(torch.int32).max, dtype=torch.int32, device=cu_seqlens_qs.device)]), dim=0, - index=cal_seq_idx_for_q( - cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long()) + index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(), + ) return cu_seqlen_ks_for_each_q.int() @tensor_cache -def cal_cu_seqlen_ke_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, - cu_seqlens_ks: torch.LongTensor, cu_seqlens_ke: torch.LongTensor, - q_start_idxs: torch.LongTensor, seq_len: int, - kv_stride: int) -> torch.IntTensor: +def cal_cu_seqlen_ke_for_q( + cu_seqlens_qs: torch.LongTensor, + cu_seqlens_qe: torch.LongTensor, + cu_seqlens_ks: torch.LongTensor, + cu_seqlens_ke: torch.LongTensor, + q_start_idxs: torch.LongTensor, + seq_len: int, + kv_stride: int, +) -> torch.IntTensor: cu_seqlen_ke_for_each_q = torch.gather( - input=torch.cat( - [cu_seqlens_ke, - torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]), + input=torch.cat([cu_seqlens_ke, torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]), dim=0, - index=cal_seq_idx_for_q( - cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long()) - casual_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), - dtype=torch.int32, - device=cu_seqlens_qs.device) + index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(), + ) + 
casual_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), dtype=torch.int32, device=cu_seqlens_qs.device) for i in range(len(cu_seqlens_qs)): - casual_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i]:cu_seqlens_qe[i]] = (torch.arange( - q_start_idxs[i], - q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], - dtype=torch.int32, - device=cu_seqlens_qs.device) + 1) // kv_stride + cu_seqlens_ks[i] + casual_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = ( + torch.arange( + q_start_idxs[i], q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], dtype=torch.int32, device=cu_seqlens_qs.device + ) + + 1 + ) // kv_stride + cu_seqlens_ks[i] cu_seqlen_ke_for_each_q = torch.minimum(casual_cu_seqlen_ke_for_each_q, cu_seqlen_ke_for_each_q) return cu_seqlen_ke_for_each_q.int() @tensor_cache -def cal_ks_ke_from_cu_seqlen_qk(cu_seqlens_q: torch.LongTensor, - cu_seqlens_k: torch.LongTensor = None, - offs_q: torch.LongTensor = None, - *, - seq_len: int, - kv_stride: int = 1, - cp_rank: int = 0, - cp_size: int = 1, - balanced_cp=False): - ''' +def cal_ks_ke_from_cu_seqlen_qk( + cu_seqlens_q: torch.LongTensor, + cu_seqlens_k: torch.LongTensor = None, + offs_q: torch.LongTensor = None, + *, + seq_len: int, + kv_stride: int = 1, + cp_rank: int = 0, + cp_size: int = 1, + balanced_cp=False, +): + """ seq_len: seq len per cp rank balanced cp slice assignment: 0 1 2 3 3 2 1 0 - ''' + """ n_seq = len(cu_seqlens_q) - 1 assert n_seq > 0 assert cu_seqlens_q.shape == (n_seq + 1,) @@ -170,10 +166,12 @@ def cal_ks_ke_from_cu_seqlen_qk(cu_seqlens_q: torch.LongTensor, def f(x: torch.Tensor): chunks = x.chunk(cp_size * 2) - return torch.cat([ - chunks[cp_rank], - chunks[cp_size - cp_rank - 1], - ]) + return torch.cat( + [ + chunks[cp_rank], + chunks[cp_size - cp_rank - 1], + ] + ) ks = f(ks) ke = f(ke) @@ -189,8 +187,7 @@ def ceil_to_ue8m0(x: torch.Tensor): return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) -def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], - use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: +def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)]) x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4) sf = x_amax / 448.0 @@ -239,14 +236,18 @@ def generate_random_cu_seqlens(per_cp_seqlen, cp_size=4, cp_rank=3, kv_stride=1, total_seqlen - (cp_rank + 1) * per_chunk_seqlen, total_seqlen - cp_rank * per_chunk_seqlen, ) - ks = torch.cat([ - cu_seqlens_ks_for_each_q[slice_short], - cu_seqlens_ks_for_each_q[slice_long], - ]) - ke = torch.cat([ - cu_seqlens_ke_for_each_q[slice_short], - cu_seqlens_ke_for_each_q[slice_long], - ]) + ks = torch.cat( + [ + cu_seqlens_ks_for_each_q[slice_short], + cu_seqlens_ks_for_each_q[slice_long], + ] + ) + ke = torch.cat( + [ + cu_seqlens_ke_for_each_q[slice_short], + cu_seqlens_ke_for_each_q[slice_long], + ] + ) assert len(ks) == len(ke) == per_cp_seqlen return ks, ke @@ -302,11 +303,9 @@ def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): raise_assert: Whether to raise assertion error on failure """ sim = calculate_tensor_similarity(x, y, name) - diff = 1. 
- sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print( - f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m" - ) + print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m") if raise_assert: assert False # noqa: B011 @@ -316,11 +315,8 @@ def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True): cu_seqlens = torch.randint(128, 4096, (1000,), dtype=torch.int32, device="cuda") last_idx = torch.where(cu_seqlens.cumsum(dim=0) >= seq_len)[0][0] cu_seqlens_cumsum = cu_seqlens[:last_idx].cumsum(dim=0) - cu_seqlens_qs = torch.cat( - [torch.zeros(1, dtype=torch.int32, device=cu_seqlens.device), cu_seqlens_cumsum]) - cu_seqlens_qe = torch.cat( - [cu_seqlens_cumsum, - torch.ones(1, dtype=torch.int32, device=cu_seqlens.device) * seq_len]) + cu_seqlens_qs = torch.cat([torch.zeros(1, dtype=torch.int32, device=cu_seqlens.device), cu_seqlens_cumsum]) + cu_seqlens_qe = torch.cat([cu_seqlens_cumsum, torch.ones(1, dtype=torch.int32, device=cu_seqlens.device) * seq_len]) from tilelang.profiler import do_bench diff --git a/examples/dequantize_gemm/README.md b/examples/dequantize_gemm/README.md index 0c6116775..25ef617a2 100644 --- a/examples/dequantize_gemm/README.md +++ b/examples/dequantize_gemm/README.md @@ -19,7 +19,7 @@ def dequant_matmul( T.clear(Ct_local) for k in T.Pipelined( - T.ceildiv(K, block_K), + T.ceildiv(K, block_K), num_stages=num_stages ): T.copy(A[by * block_M, k * block_K], A_shared) diff --git a/examples/dequantize_gemm/dequantize_utils.py b/examples/dequantize_gemm/dequantize_utils.py index b14c0aee6..90a6265ff 100644 --- a/examples/dequantize_gemm/dequantize_utils.py +++ b/examples/dequantize_gemm/dequantize_utils.py @@ -39,12 +39,10 @@ def torch_convert_bit_twiddling(tensor): res0 = val_concat_expanded & mask res1 = (val_concat_expanded << 3) & mask res2 = (val_concat_expanded << 6) & mask - res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | ( - (val_concat_expanded >> 7) & mask3) + res3 = ((val_concat_expanded << 1) & mask1) | ((val_concat_expanded >> 3) & mask2) | ((val_concat_expanded >> 7) & mask3) # Select the correct result based on position - bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1, - torch.where(pos == 2, res2, res3))) + bf16 = torch.where(pos == 0, res0, torch.where(pos == 1, res1, torch.where(pos == 2, res2, res3))) # Convert to uint16 for .view(torch.bfloat16) bf16_uint16 = (bf16 & 0xFFFF).to(torch.uint16) @@ -110,7 +108,7 @@ def print_bit(name, val): val (torch.Tensor): A scalar PyTorch tensor (numeric) whose 32-bit binary representation will be shown. 
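# [Illustration, not part of the diff] calc_sim / assert_similar above use the
# same measure as assert_tensors_similar in utils.py:
#     sim = 2 * <x, y> / (|x|^2 + |y|^2),  diff = 1 - sim
# sim is 1 exactly when x == y elementwise and decreases as they diverge.
import torch

def similarity(x: torch.Tensor, y: torch.Tensor) -> float:
    x, y = x.double(), y.double()
    denom = (x * x + y * y).sum()
    return 1.0 if denom == 0 else (2 * (x * y).sum() / denom).item()

x = torch.randn(1024)
assert abs(1.0 - similarity(x, x.clone())) < 1e-12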
""" val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) @@ -122,7 +120,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -132,21 +130,19 @@ def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True): x_mask = torch.isfinite(x) y_mask = torch.isfinite(y) if not torch.all(x_mask == y_mask): - print_red_warning(f'{name} Error: isfinite mask mismatch') + print_red_warning(f"{name} Error: isfinite mask mismatch") if raise_assert: raise AssertionError - if not torch.isclose( - x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, - equal_nan=True).all(): - print_red_warning(f'{name} Error: nonfinite value mismatch') + if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all(): + print_red_warning(f"{name} Error: nonfinite value mismatch") if raise_assert: raise AssertionError x = x.masked_fill(~x_mask, 0) y = y.masked_fill(~y_mask, 0) sim = calc_sim(x, y, name) - diff = (1. - sim).item() - print(f'{diff=}') + diff = (1.0 - sim).item() + print(f"{diff=}") if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff=}') + print_red_warning(f"{name} Error: {diff=}") if raise_assert: raise AssertionError diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py index e30845b8d..36b32c0a8 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py @@ -24,6 +24,7 @@ def get_configs(): the parameter name to its chosen value. """ import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -32,65 +33,64 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] -@tilelang.autotune(configs=get_configs(),) +@tilelang.autotune( + configs=get_configs(), +) @tilelang.jit( out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - fast_dequant=True, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + fast_dequant=True, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): + """ + Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T. + + This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts: + - A: dense input of shape (M, K) with dtype `in_dtype`. 
+ - B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`. + - C: output of shape (M, N) with dtype `out_dtype`. + + The generated kernel supports two dequantization paths: + - fast_dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group. + - simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element. + + Important behavior and requirements: + - num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits. + - QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes. + - Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid. + - When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group. + - The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages. + + Parameters that alter kernel layout/behavior (brief): + - block_M, block_N, block_K: tile sizes for M, N, and K dimensions. + - num_stages: number of software pipeline stages for the K-loop. + - threads: number of threads used per kernel block. + - split: extra K-splitting factor; K must be divisible by block_K * split. + - source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics. + + Returns: + A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel. """ - Builds a parameterized TileLang/TIR matrix-multiplication kernel that dequantizes 4-bit FP inputs to BF16 on-the-fly and computes C = A @ B^T. - - This function returns a tiled, autotunable prim_func implementing a block-wise GEMM with shared-memory buffering and a pipelined K-loop. The kernel accepts: - - A: dense input of shape (M, K) with dtype `in_dtype`. - - B: packed quantized input of shape (N, QK) where QK = K / (8 / num_bits) stored as `uint8`. - - C: output of shape (M, N) with dtype `out_dtype`. - - The generated kernel supports two dequantization paths: - - fast_dequant (fast_dequant=True): calls an external mxfp dequantization intrinsic (twiddling-based) loaded from a C source returned by get_mxfp_intrin_group. - - simple dequant (fast_dequant=False): performs a pure-TIR FP4 -> BF16 conversion per element. - - Important behavior and requirements: - - num_bits (default 4) is the bit-width of the quantized elements; storage_dtype is uint8 and num_elems_per_byte = 8 // num_bits. - - QK = K // num_elems_per_byte and Block_QK = block_K // num_elems_per_byte determine B and shared-buffer shapes. - - Asserts that K % (block_K * split) == 0; K must be divisible by block_K * split for the tiling to be valid. - - When fast_dequant is True, a valid mxfp intrinsic group (C source and function name) must be available via tilelang.quantize.get_mxfp_intrin_group. - - The kernel launches a 2D grid over ceildiv(N, block_N) and ceildiv(M, block_M) and uses `threads` threads per block with `num_stages` pipeline stages. - - Parameters that alter kernel layout/behavior (brief): - - block_M, block_N, block_K: tile sizes for M, N, and K dimensions. - - num_stages: number of software pipeline stages for the K-loop. - - threads: number of threads used per kernel block. 
- - split: extra K-splitting factor; K must be divisible by block_K * split. - - source_format, num_bits: describe the quantized data layout passed to the mxfp intrinsics. - - Returns: - A TileLang/TIR prim_func (the compiled `main`) implementing the described dequantize-then-GEMM kernel. - """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte @@ -121,7 +121,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a TileLang macro that performs fast, twiddling-based dequantization from packed FP4 to BF16 using an external runtime plugin. @@ -131,13 +131,13 @@ def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the dequantized BF16 values back to a shared dequantized buffer for use by the kernel. Notes and preconditions: - - Asserts that `in_dtype == "fp4"` and `out_dtype == "bfloat16"`. + - Asserts that `in_dtype == "fp4"` and `out_dtype == T.bfloat16`. - The generated macro depends on several surrounding-scope symbols (e.g., `import_source`, `func_name`, `block_K`, `Block_QK`, `threads`, `num_elems_per_byte`, `storage_dtype`, and `out_dtype`) and expects them to be defined consistently in the enclosing kernel. - The macro is optimized for block-wise, per-thread transactions sized to the target storage width (uses a MAX_TRANSACTION_SIZE_BITS constant) and uses local/register buffers sized accordingly. - The macro uses `T.import_source` to bring the external plugin into the module and `T.call_extern` to perform the high-throughput dequantization; callers must ensure the external function matches the expected calling convention and memory layout. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -189,12 +189,11 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared): # Finally, store the dequantized data to shared memory. for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple TIR dequantization macro that converts packed 4-bit FP (FP4) stored in uint8 into bfloat16. @@ -205,7 +204,7 @@ def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the dequantized bfloat16 block into B_dequantize_shared. Constraints: - - Supports only in_dtype="fp4" and out_dtype="bfloat16". + - Supports only in_dtype="fp4" and out_dtype=T.bfloat16. - The helper assumes nbit == 4 and produces bfloat16 values. - The macro uses a fixed test-scale of 0 (no per-element scaling) as written. @@ -213,49 +212,49 @@ def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): A TIR macro function performing the described in-place block dequantization from packed uint8 FP4 to bfloat16. 
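A worked instance of the shape bookkeeping set up just above (num_bits = 4, uint8 storage), using the K = 256, block_K = 128, split = 1 shapes exercised by this example's main(); all names below are local to the sketch:

num_bits, K, block_K, split = 4, 256, 128, 1
num_elems_per_byte = 8 // num_bits         # 2 packed FP4 codes per uint8 byte
QK = K // num_elems_per_byte               # 128: packed K extent, so B has shape (N, QK)
Block_QK = block_K // num_elems_per_byte   # 64: packed K extent of each B tile in shared memory
assert K % (block_K * split) == 0          # the same precondition the kernel asserts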
""" assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] - def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, - scale: tir.PrimExpr, dtype: str): + def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value. - - This helper extracts the 4-bit field located at the bit position `pos` within the - byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an - exponent `scale` offset to align it with bfloat16 exponent bias, clamps the - resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern. - - Parameters: - nbit (int): Number of bits in the packed element; must be 4. - val (tir.PrimExpr): A uint8 value containing packed FP4 elements. - pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract. - scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16. - dtype (str): Target dtype string; must be "bfloat16". - - Returns: - tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value. - - Notes: - - The function asserts `nbit == 4`, `dtype == "bfloat16"`, and that `val.dtype` is "uint8". - - The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16 - bit fields and clamps the computed exponent to fit into 8 bits. + Convert a 4-bit FP4 value packed in a uint8 byte into a bfloat16 value. + + This helper extracts the 4-bit field located at the bit position `pos` within the + byte `val`, interprets it as an FP4 (sign, exponent, mantissa) value, applies an + exponent `scale` offset to align it with bfloat16 exponent bias, clamps the + resulting exponent to 8 bits, and returns the assembled bfloat16 bit pattern. + + Parameters: + nbit (int): Number of bits in the packed element; must be 4. + val (tir.PrimExpr): A uint8 value containing packed FP4 elements. + pos (tir.PrimExpr): Index (0-based) of which FP4 nibble inside `val` to extract. + scale (tir.PrimExpr): Exponent offset applied when converting FP4 exponent to bfloat16. + dtype (str): Target dtype string; must be T.bfloat16. + + Returns: + tir.PrimExpr: A bfloat16-typed PrimExpr containing the converted value. + + Notes: + - The function asserts `nbit == 4`, `dtype == T.bfloat16`, and that `val.dtype` is T.uint8. + - The conversion uses a fixed mapping from FP4 exponent/mantissa layout into bfloat16 + bit fields and clamps the computed exponent to fit into 8 bits. 
""" assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we use the max function to limit the exponential part to 8 bits - e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") + e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, T.uint16)) + m_f4 = f4 & tir.const(1, T.uint16) val_bf16 = tir.reinterpret( - "bfloat16", ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @T.macro @@ -292,32 +291,32 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared): @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Kernel entry for the tiled, pipelined matmul used by the generated prim_func. - - This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it: - - Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile. - - Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout. - - Pipelines over K in chunks of `block_K` for `num_stages` stages: - - Loads A and packed B tiles into shared memory. - - Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine. - - Performs a GEMM accumulating into C_local with B transposed. - - Stores the accumulated block from C_local back to the global output C via C_shared. - - Parameters: - - A: input tile of shape (M, K) with dtype `in_dtype`. - - B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing). - - C: output tensor of shape (M, N) with dtype `out_dtype`. - - Side effects: - - Writes the computed output block into the global tensor `C`. - - Uses and updates shared memory buffers and per-thread accumulators. - - No value is returned. + Kernel entry for the tiled, pipelined matmul used by the generated prim_func. + + This function implements a block-wise GEMM over a 2D grid (grid dims: ceildiv(N, block_N) x ceildiv(M, block_M)) with a thread block of `threads`. For each output block it: + - Allocates shared buffers for A, the packed/quantized B, and a dequantized B tile. + - Allocates a fragment accumulator (C_local) and a shared output tile (C_shared) with a swizzled layout. 
+ - Pipelines over K in chunks of `block_K` for `num_stages` stages: + - Loads A and packed B tiles into shared memory. + - Dequantizes B into B_dequantize_shared using either the fast (twiddling/external) or the simple (pure-TIR) dequantization routine. + - Performs a GEMM accumulating into C_local with B transposed. + - Stores the accumulated block from C_local back to the global output C via C_shared. + + Parameters: + - A: input tile of shape (M, K) with dtype `in_dtype`. + - B: packed/quantized input of shape (N, QK) with storage dtype `storage_dtype` (quantized FP4 packing). + - C: output tensor of shape (M, N) with dtype `out_dtype`. + + Side effects: + - Writes the computed output block into the global tensor `C`. + - Uses and updates shared memory buffers and per-thread accumulators. + + No value is returned. """ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -327,10 +326,6 @@ def main( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) - T.annotate_layout({ - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) - T.clear(C_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) @@ -344,7 +339,7 @@ def main( T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -363,7 +358,7 @@ def ref_program_twiddling(A, qB): Returns: torch.Tensor: Result matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -383,7 +378,7 @@ def ref_program_simple(A, qB): Returns: torch.Tensor: Resulting matrix C in bfloat16 with shape (M, N). 
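ref_program_simple relies on torch_convert to expand qB from its packed (N, K // 2) uint8 form back to N x K values. As a rough standalone sketch of just the packing convention (two 4-bit codes per byte with the even column in the low nibble, matching how the simple dequant path indexes j % num_elems_per_byte), and not the library's torch_convert, which additionally maps each code to its bf16 value:

import torch

def unpack_fp4_codes(qB: torch.Tensor) -> torch.Tensor:
    # qB: (N, K // 2) uint8, two 4-bit codes per byte, low nibble first.
    lo = qB & 0xF                  # codes for even output columns
    hi = (qB >> 4) & 0xF           # codes for odd output columns
    out = torch.empty(qB.shape[0], qB.shape[1] * 2, dtype=torch.uint8, device=qB.device)
    out[:, 0::2] = lo
    out[:, 1::2] = hi
    return out                     # raw 4-bit codes; the value mapping is a separate step

The twiddling path (ref_program_twiddling / torch_convert_bit_twiddling) assumes a permuted bit layout and is not covered by this sketch.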
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -409,16 +404,15 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): """ total_flops = 2 * m * n * k if tune: - kernel = matmul( - m, n, k, "bfloat16", "bfloat16", "float32", num_bits=4, fast_dequant=fast_dequant) + kernel = matmul(m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, fast_dequant=fast_dequant) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, fast_dequant=fast_dequant, block_M=256, @@ -426,7 +420,8 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): block_K=128, num_stages=2, threads=256, - split=1) + split=1, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) if fast_dequant: profiler.assert_allclose(ref_program_twiddling, rtol=0.01, atol=0.01) @@ -437,6 +432,27 @@ def main(m=256, n=256, k=256, fast_dequant=True, tune=False): print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(m=4096, n=4096, k=4096, fast_dequant=True): + kernel = matmul( + m, + n, + k, + "bfloat16", + "bfloat16", + "float32", + num_bits=4, + fast_dequant=fast_dequant, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, + ) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main(256, 256, 256, True) main(256, 256, 256, False) diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py index ac1417aeb..cc37c8bc4 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py @@ -7,45 +7,45 @@ from dequantize_utils import torch_convert_bit_twiddling, torch_convert -def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, - dtype: str): +def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. + Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. - This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its - bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by - `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. + This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its + bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by + `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. - Parameters: - nbit (int): Number of bits in the packed field (must be 4). - val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. - pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). - scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). - dtype (str): Destination dtype string (must be "bfloat16"). + Parameters: + nbit (int): Number of bits in the packed field (must be 4). 
+ val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. + pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). + scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). + dtype (str): Destination dtype string (must be T.bfloat16). - Returns: - tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. + Returns: + tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. - Notes: - - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". - - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. - """ + Notes: + - Preconditions are enforced via assertions: nbit == 4, dtype == T.bfloat16, and val.dtype == T.uint8. + - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. + """ assert nbit == 4 - assert dtype == "bfloat16" - assert val.dtype == "uint8" - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") + assert dtype == T.bfloat16 + assert val.dtype == T.uint8 + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) # Exponential bias between f4 and bf16 is 2^(8-1) - 2^(2-1) = 126 - e_bf16 = e_f4 + tir.const(126, "uint16") + e_bf16 = e_f4 + tir.const(126, T.uint16) # Scale is the exponential part, within the representation of uint8 # To handle the overflow, we may use the min function to limit the exponential part to 8 bits # e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) - m_f4 = f4 & tir.const(1, "uint16") - val_bf16 = tir.reinterpret("bfloat16", - ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + m_f4 = f4 & tir.const(1, T.uint16) + val_bf16 = tir.reinterpret( + T.bfloat16, + ((((s << tir.const(8, T.uint16)) | e_bf16) << tir.const(7, T.uint16)) | (m_f4 << tir.const(6, T.uint16))).astype(T.uint16), + ) return val_bf16 @@ -65,6 +65,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. 
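get_configs() above expands a dict of candidate values into the full Cartesian product of tuning configurations; a trimmed-down illustration of that pattern, with made-up parameter lists:

import itertools

iter_params = dict(block_M=[64, 128], threads=[128, 256])   # illustrative subset only
configs = [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())]
# configs == [{'block_M': 64, 'threads': 128}, {'block_M': 64, 'threads': 256},
#             {'block_M': 128, 'threads': 128}, {'block_M': 128, 'threads': 256}]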
""" import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -73,70 +74,74 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] - - -@tilelang.autotune(configs=get_configs(),) -@tilelang.jit(out_idx=[-1],) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit( + out_idx=[-1], +) +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ - Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - - The generated kernel accepts: - - A: dense matrix with element type `in_dtype`. - - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). - - Scale: per-block scale/exponent information used to dequantize B. - The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: - - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. - - fast_dequant (False): uses a simple elementwise dequantization helper. - - Parameters: - M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). - in_dtype (str): element type of A (e.g., "fp4" in this file). - out_dtype (str): output tensor element type (e.g., "bfloat16"). - accum_dtype (str): accumulation type used for the inner GEMM. - source_format (str, optional): format string passed to intrinsic selector (default "uint"). - num_bits (int, optional): number of bits per quantized element in B (default 4). - scale_size (int, optional): number of elements grouped per scale entry (default 32). - fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). - block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). - num_stages (int, optional): pipelining stages for K loop (default 2). - threads (int, optional): threads per block used by the kernel (default 256). - split (int, optional): split factor along K used by the scheduler (default 1). - with_bias (bool, optional): whether to add Bias to the output (default False). - - Returns: - A T.prim_func implementing the tiled, pipelined GEMM that: - - loads tiled blocks of A and packed B to shared memory, - - dequantizes B via the chosen path into a shared dequantized tile, - - performs a tiled GEMM accumulating into local fragments, - - writes the final MxN block to the global output tensor. + Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. 
- Notes: - - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. - - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. - - An assertion enforces that K % (block_K * split) == 0. + The generated kernel accepts: + - A: dense matrix with element type `in_dtype`. + - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). + - Scale: per-block scale/exponent information used to dequantize B. + The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: + - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. + - fast_dequant (False): uses a simple elementwise dequantization helper. + + Parameters: + M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). + in_dtype (str): element type of A (e.g., "fp4" in this file). + out_dtype (str): output tensor element type (e.g., T.bfloat16). + accum_dtype (str): accumulation type used for the inner GEMM. + source_format (str, optional): format string passed to intrinsic selector (default "uint"). + num_bits (int, optional): number of bits per quantized element in B (default 4). + scale_size (int, optional): number of elements grouped per scale entry (default 32). + fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). + block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). + num_stages (int, optional): pipelining stages for K loop (default 2). + threads (int, optional): threads per block used by the kernel (default 256). + split (int, optional): split factor along K used by the scheduler (default 1). + with_bias (bool, optional): whether to add Bias to the output (default False). + + Returns: + A T.prim_func implementing the tiled, pipelined GEMM that: + - loads tiled blocks of A and packed B to shared memory, + - dequantizes B via the chosen path into a shared dequantized tile, + - performs a tiled GEMM accumulating into local fragments, + - writes the final MxN block to the global output tensor. + + Notes: + - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. + - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. + - An assertion enforces that K % (block_K * split) == 0. """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shape = (M, K) @@ -150,6 +155,7 @@ def matmul(M, assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -164,7 +170,7 @@ def matmul(M, assert func_name is not None, "mxfp_intrin_info is not found" import_source = import_source - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. 
@@ -175,12 +181,12 @@ def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. - The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -252,24 +258,23 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale, k): for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Create a simple (scalar) dequantization macro that converts 4-bit packed inputs to bfloat16. Returns a T.macro that, given shared-storage buffers B_shared, B_dequantize_shared, a Scale tensor, and block index k, unpacks 4-bit values from B_shared, converts each nibble to a bfloat16 value using _tir_u8_to_f4_to_bf16, applies the per-element exponential Scale, and writes the dequantized BF16 block into B_dequantize_shared. Notes: - - Only supports in_dtype="fp4" and out_dtype="bfloat16". + - Only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro expects B_shared and B_dequantize_shared to have the shapes established in the enclosing scope (B_shared_shape, B_dequantize_shared_shape) and performs block-local copying into allocated fragments before elementwise conversion. - Scale holds the exponent-like scaling values indexed per output element as used by the conversion helper. 
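Numerically, the Scale handling described above is the same per-group exponent scaling the reference programs apply with B *= 2 ** Scale[:, arange(K) // 32]: every run of scale_size consecutive K columns shares one uint8 exponent. A standalone sketch, assuming scale_size = 32 as in this example (all names local to the sketch):

import torch

N, K, scale_size = 4, 128, 32
B = torch.randn(N, K)                                         # dequantized values before scaling
Scale = torch.randint(0, 4, (N, K // scale_size), dtype=torch.uint8)
group = torch.arange(K) // scale_size                         # column -> scale-group index
B_scaled = B * (2.0 ** Scale[:, group].to(torch.float32))     # one shared exponent per 32 columns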
""" assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale, k): @@ -301,33 +306,32 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale, k): B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale[ - bx * block_N + i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + bx * block_N + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, - ) * T.shift_left( - 1, (Scale[bx * block_N + i, k * block_K // scale_size + j // scale_size])) + ) * T.shift_left(1, (Scale[bx * block_N + i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) return simple_dequant_bf16_fp4 @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Scale: T.Tensor(Scale_shape, storage_dtype), - Bias: T.Tensor(Bias_shape, out_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Bias: T.Tensor(Bias_shape, out_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. + Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. - This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. + This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. - Parameters are self-descriptive in the signature; notable behaviors: - - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. - - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. - - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). - - The function writes results in-place into C. + Parameters are self-descriptive in the signature; notable behaviors: + - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. + - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. + - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). + - The function writes results in-place into C. 
""" with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -337,23 +341,24 @@ def main( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) if with_bias: - T.annotate_layout({ - Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), - }) + T.annotate_layout( + { + Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), + } + ) if threads == 512: T.disable_warp_group_reg_alloc() if with_bias: - T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - Bias_shared) + T.copy(Bias[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N], Bias_shared) T.copy(Bias_shared, C_local) else: T.clear(C_local) @@ -368,7 +373,7 @@ def main( T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -387,9 +392,9 @@ def ref_program_twiddling(A, qB, Scale, Bias=None): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -410,9 +415,9 @@ def ref_program_twiddling_with_bias(A, qB, Scale, Bias): Returns: torch.Tensor: Resulting matrix C with shape (M, N) in bfloat16. """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert_bit_twiddling(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -434,9 +439,9 @@ def ref_program_simple(A, qB, Scale, Bias=None): No in-place modification is performed on inputs (a local floating copy of B is scaled). """ - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -462,9 +467,9 @@ def ref_program_simple_with_bias(A, qB, Scale, Bias): No in-place modification is performed on inputs (a local floating copy of B is scaled). 
""" - dtypeC = "bfloat16" + dtypeC = T.bfloat16 B = torch_convert(qB) - B *= 2**(Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) + B *= 2 ** (Scale[:, (torch.arange(B.shape[1], device=B.device) // 32)]) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -491,24 +496,16 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, if tune: kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - fast_dequant=fast_dequant, - with_bias=with_bias) + m, n, k, T.bfloat16, T.bfloat16, T.float32, num_bits=4, scale_size=scale_size, fast_dequant=fast_dequant, with_bias=with_bias + ) else: kernel = matmul( m, n, k, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=4, scale_size=scale_size, block_M=256, @@ -518,7 +515,8 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, threads=256, split=1, fast_dequant=fast_dequant, - with_bias=with_bias) + with_bias=with_bias, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) @@ -538,6 +536,29 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(m=4096, n=4096, k=4096, scale_size=32, fast_dequant=True, with_bias=False): + kernel = matmul( + m, + n, + k, + "bfloat16", + "bfloat16", + "float32", + num_bits=4, + scale_size=scale_size, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, + fast_dequant=fast_dequant, + with_bias=with_bias, + ) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": M, N, K = 256, 256, 256 scale_size = 32 diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py index 7dad79597..12395df0a 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper_tma.py @@ -7,29 +7,28 @@ from dequantize_utils import torch_convert_bit_twiddling, torch_convert -def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, - dtype: str): +def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale: tir.PrimExpr, dtype: str): """ - Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. + Convert a 4-bit field packed in a uint8 into a bfloat16 value, applying an exponent scale. - This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its - bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by - `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. + This helper extracts a 4-bit nibble from `val` at byte-nibble position `pos`, interprets its + bits as a sign/exponent/mantissa in the 4-bit custom FP4 layout, adjusts the exponent by + `scale` (clamped to an 8-bit range), and assembles the corresponding bfloat16 representation. - Parameters: - nbit (int): Number of bits in the packed field (must be 4). - val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. - pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). 
- scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). - dtype (str): Destination dtype string (must be "bfloat16"). + Parameters: + nbit (int): Number of bits in the packed field (must be 4). + val (tir.PrimExpr): Packed input value of dtype `uint8` containing one or more 4-bit fields. + pos (tir.PrimExpr): Index of the nibble within `val` (used to shift/extract the 4-bit field). + scale (tir.PrimExpr): Per-element exponent adjustment added to the extracted exponent (uint-like). + dtype (str): Destination dtype string (must be "bfloat16"). - Returns: - tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. + Returns: + tir.PrimExpr: The resulting value reinterpreted as `bfloat16`. - Notes: - - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". - - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. - """ + Notes: + - Preconditions are enforced via assertions: nbit == 4, dtype == "bfloat16", and val.dtype == "uint8". + - The function clamps the adjusted exponent to the 8-bit range before assembling the bfloat16 bit pattern. + """ assert nbit == 4 assert dtype == "bfloat16" assert val.dtype == "uint8" @@ -43,9 +42,10 @@ def _tir_u8_to_f4_to_bf16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, scale # To handle the overflow, we may use the min function to limit the exponential part to 8 bits # e_bf16 = T.min(e_bf16 + scale, tir.const((1 << 8) - 1, "uint16")) m_f4 = f4 & tir.const(1, "uint16") - val_bf16 = tir.reinterpret("bfloat16", - ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) - | (m_f4 << tir.const(6, "uint16"))).astype("uint16")) + val_bf16 = tir.reinterpret( + "bfloat16", + ((((s << tir.const(8, "uint16")) | e_bf16) << tir.const(7, "uint16")) | (m_f4 << tir.const(6, "uint16"))).astype("uint16"), + ) return val_bf16 @@ -65,6 +65,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. """ import itertools + iter_params = dict( block_M=[64, 128, 256], block_N=[64, 128, 256], @@ -73,67 +74,71 @@ def get_configs(): threads=[128, 256, 512], split=[1, 2], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] - - -@tilelang.autotune(configs=get_configs(),) -@tilelang.jit(out_idx=[-1],) -def matmul(M, - N, - K, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=256, - block_N=128, - block_K=128, - num_stages=2, - threads=256, - split=1): + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit( + out_idx=[-1], +) +def matmul( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + source_format="uint", + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=256, + block_N=128, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ - Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - - The generated kernel accepts: - - A: dense matrix with element type `in_dtype`. - - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). - - Scale: per-block scale/exponent information used to dequantize B. 
- The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: - - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. - - fast_dequant (False): uses a simple elementwise dequantization helper. - - Parameters: - M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). - in_dtype (str): element type of A (e.g., "fp4" in this file). - out_dtype (str): output tensor element type (e.g., "bfloat16"). - accum_dtype (str): accumulation type used for the inner GEMM. - source_format (str, optional): format string passed to intrinsic selector (default "uint"). - num_bits (int, optional): number of bits per quantized element in B (default 4). - scale_size (int, optional): number of elements grouped per scale entry (default 32). - fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). - block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). - num_stages (int, optional): pipelining stages for K loop (default 2). - threads (int, optional): threads per block used by the kernel (default 256). - split (int, optional): split factor along K used by the scheduler (default 1). - with_bias (bool, optional): whether to add Bias to the output (default False). - - Returns: - A T.prim_func implementing the tiled, pipelined GEMM that: - - loads tiled blocks of A and packed B to shared memory, - - dequantizes B via the chosen path into a shared dequantized tile, - - performs a tiled GEMM accumulating into local fragments, - - writes the final MxN block to the global output tensor. + Construct and return a tiled matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized B (shape Nx(QK)) and writes an MxN output in out_dtype. - Notes: - - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. - - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. - - An assertion enforces that K % (block_K * split) == 0. + The generated kernel accepts: + - A: dense matrix with element type `in_dtype`. + - B: packed quantized matrix stored as uint8 with `num_bits` bits per element (QK = K / (8/num_bits)). + - Scale: per-block scale/exponent information used to dequantize B. + The kernel dequantizes B to a working floating format (out_dtype/accum_dtype) using one of two paths: + - fast_dequant (True): uses an external, hardware/implementation-specific intrinsic group (twiddling) for batch dequantization. + - fast_dequant (False): uses a simple elementwise dequantization helper. + + Parameters: + M, N, K (int): matrix dimensions (A is MxK, result is MxN). K must be divisible by (block_K * split). + in_dtype (str): element type of A (e.g., "fp4" in this file). + out_dtype (str): output tensor element type (e.g., "bfloat16"). + accum_dtype (str): accumulation type used for the inner GEMM. + source_format (str, optional): format string passed to intrinsic selector (default "uint"). + num_bits (int, optional): number of bits per quantized element in B (default 4). + scale_size (int, optional): number of elements grouped per scale entry (default 32). + fast_dequant (bool, optional): choose the fast intrinsic dequantization path when available (default True). 
+ block_M, block_N, block_K (int, optional): tile sizes for M, N, and K dimensions (defaults 256, 128, 128). + num_stages (int, optional): pipelining stages for K loop (default 2). + threads (int, optional): threads per block used by the kernel (default 256). + split (int, optional): split factor along K used by the scheduler (default 1). + with_bias (bool, optional): whether to add Bias to the output (default False). + + Returns: + A T.prim_func implementing the tiled, pipelined GEMM that: + - loads tiled blocks of A and packed B to shared memory, + - dequantizes B via the chosen path into a shared dequantized tile, + - performs a tiled GEMM accumulating into local fragments, + - writes the final MxN block to the global output tensor. + + Notes: + - The function queries an intrinsic group to obtain a fast dequantization implementation when fast_dequant is enabled; that intrinsic must supply a valid C source and function name. + - The kernel layout uses swizzled shared-memory layouts for A, B, and the shared C tile. + - An assertion enforces that K % (block_K * split) == 0. """ num_elems_per_byte = 8 // num_bits storage_dtype = "uint8" @@ -150,6 +155,7 @@ def matmul(M, assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -252,8 +258,7 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale_shared, for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling @@ -301,8 +306,8 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale_shared[ - i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, ) * T.shift_left(1, (Scale_shared[i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) @@ -311,22 +316,22 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Scale: T.Tensor(Scale_shape, storage_dtype), - Bias: T.Tensor(Bias_shape, out_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Scale: T.Tensor(Scale_shape, storage_dtype), + Bias: T.Tensor(Bias_shape, out_dtype), + C: T.Tensor((M, N), out_dtype), ): """ - Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. + Tiled, pipelined kernel entry that multiplies A with a quantized B (with per-block Scale) producing C. - This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. 
The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. + This prim-level kernel implements a blocked, multi-threaded matmul: it loads tiles of A and the packed/quantized B into shared memory, dequantizes B (either via the fast intrinsic twiddling path or the simple per-element path), performs a block GEMM (with B transposed), and writes the accumulated block results into the output tensor C. The kernel allocates shared buffers for A, B, and the dequantized B, and a local fragment for accumulation; it runs over K in pipelined stages and expects the provided shapes and dtypes to match the tiling parameters used to build the function. - Parameters are self-descriptive in the signature; notable behaviors: - - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. - - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. - - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). - - The function writes results in-place into C. + Parameters are self-descriptive in the signature; notable behaviors: + - B is stored in a compact uint8-packed layout (num_bits per element) and is dequantized using Scale before GEMM. + - The selected dequantization path is controlled by the outer-scope flag `fast_dequant`. + - The GEMM uses transpose_B=True (i.e., multiplies A · B^T after dequantization). + - The function writes results in-place into C. """ with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -339,16 +344,20 @@ def main( # May use much more shared memory than necessary Scale_shared = T.alloc_shared((block_N, K // scale_size), storage_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + A_shared: tilelang.layout.make_swizzled_layout(A_shared), + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + C_shared: tilelang.layout.make_swizzled_layout(C_shared), + } + ) if with_bias: - T.annotate_layout({ - Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), - }) + T.annotate_layout( + { + Bias_shared: tilelang.layout.make_swizzled_layout(Bias_shared), + } + ) if threads == 512: T.disable_warp_group_reg_alloc() @@ -357,26 +366,24 @@ def main( # T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], # Bias_shared) # T.copy(Bias_shared, C_local) - T.copy(Bias[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N], - C_local) + T.copy(Bias[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N], C_local) else: T.clear(C_local) # Use 1D TMA to load Scale - T.copy(Scale[bx * block_N:(bx + 1) * block_N, :], Scale_shared) + T.copy(Scale[bx * block_N : (bx + 1) * block_N, :], Scale_shared) for k in T.Pipelined(K // block_K, num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) if fast_dequant: - get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, - k) + get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, k) else: 
get_simple_dequant_func()(B_shared, B_dequantize_shared, Scale_shared, k) T.gemm(A_shared, B_dequantize_shared, C_local, transpose_B=True) T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M:(by + 1) * block_M, bx * block_N:(bx + 1) * block_N]) + T.copy(C_shared, C[by * block_M : (by + 1) * block_M, bx * block_N : (bx + 1) * block_N]) return main @@ -399,7 +406,7 @@ def ref_program_twiddling(A, qB, Scale, Bias=None): B = torch_convert_bit_twiddling(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -424,7 +431,7 @@ def ref_program_twiddling_with_bias(A, qB, Scale, Bias): B = torch_convert_bit_twiddling(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -450,7 +457,7 @@ def ref_program_simple(A, qB, Scale, Bias=None): B = torch_convert(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) return C @@ -480,7 +487,7 @@ def ref_program_simple_with_bias(A, qB, Scale, Bias): B = torch_convert(qB) for i in range(B.shape[0]): for j in range(B.shape[1]): - B[i][j] = B[i][j] * (2**(Scale[i][j // 32])) + B[i][j] = B[i][j] * (2 ** (Scale[i][j // 32])) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) + Bias C = C.to(torch.__getattribute__(dtypeC)) return C @@ -507,16 +514,8 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, if tune: kernel = matmul( - m, - n, - k, - "bfloat16", - "bfloat16", - "float32", - num_bits=4, - scale_size=scale_size, - fast_dequant=fast_dequant, - with_bias=with_bias) + m, n, k, "bfloat16", "bfloat16", "float32", num_bits=4, scale_size=scale_size, fast_dequant=fast_dequant, with_bias=with_bias + ) else: kernel = matmul( m, @@ -534,7 +533,8 @@ def main(m=256, n=256, k=256, scale_size=32, fast_dequant=True, with_bias=False, threads=256, split=1, fast_dequant=fast_dequant, - with_bias=with_bias) + with_bias=with_bias, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Auto) diff --git a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py index 727d6d3b6..37826874b 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py @@ -24,8 +24,9 @@ def matmul( num_bits=4, ): from tilelang.quantize import _tir_packed_to_unsigned_convert + num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) storage_type = str("".join(c for c in storage_dtype if not c.isdigit())) A_shape = (M, K) @@ -39,9 +40,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = 
T.alloc_shared(A_shared_shape, in_dtype) @@ -58,21 +59,19 @@ def main( T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared) - for i in T.serial(block_N * block_K // num_elems_per_byte // - (threads * local_size_compressed)): + for i in T.serial(block_N * block_K // num_elems_per_byte // (threads * local_size_compressed)): for v in T.vectorized(0, local_size_compressed): index = i * threads * local_size_compressed + tx * local_size_compressed + v vi = index // (block_K // num_elems_per_byte) vj = index % (block_K // num_elems_per_byte) B_local[v] = B_shared[vi, vj] for v in T.serial(0, local_size): - B_dequantize_local[v] = _tir_packed_to_unsigned_convert( - storage_type, storage_nbit)( - num_bits, - B_local[v // num_elems_per_byte], - v % num_elems_per_byte, - dtype=in_dtype, - ) + B_dequantize_local[v] = _tir_packed_to_unsigned_convert(storage_type, storage_nbit)( + num_bits, + B_local[v // num_elems_per_byte], + v % num_elems_per_byte, + dtype=in_dtype, + ) for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v vi = index // block_K @@ -121,9 +120,7 @@ def run_gemm( def ref_program(A, qB): import torch - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for i in range(B.shape[0]): for j in range(B.shape[1]): B[i][j] = ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -146,25 +143,27 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( ): from tilelang.intrinsics.mma_layout import make_mma_swizzle_layout as make_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitterWithLadderTransform,) + TensorCoreIntrinEmitterWithLadderTransform, + ) from bitblas.gpu.intrin.lop3 import decode_i4_to_f16 + assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" num_bits = 4 num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -183,7 +182,7 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( block_M = block_row_warps * warp_row_tiles block_N = block_col_warps * warp_col_tiles - block_K = 32 if in_dtype == "float16" else 64 + block_K = 32 if in_dtype == T.float16 else 64 chunk = block_K // reduce_k is_smooth_a = False @@ -192,8 +191,7 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( pad_factor = 8 A_shape = (M, K) - B_shape = (N // micro_size_y, K // micro_size_k, micro_size_y, - micro_size_k // num_elems_per_byte) + B_shape = (N // micro_size_y, K // micro_size_k, micro_size_y, micro_size_k // num_elems_per_byte) A_shared_shape = (block_M, (block_K + pad_factor) if apply_pad_a else block_K) B_shared_shape = ( block_N // micro_size_y, @@ -228,7 +226,8 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( chunk=chunk, reduce_k=reduce_k, transform_kind_b=transform_b, - num_elems_per_byte=num_elems_per_byte) + num_elems_per_byte=num_elems_per_byte, + ) vec_load_qb = 16 if block_N * (block_K // reduce_k) // num_elems_per_byte // threads < vec_load_qb: 
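# Illustrative aside, not part of this patch: `ref_program` earlier in this file
# unpacks two unsigned 4-bit values per int8 byte with a Python loop,
# ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF). Assuming that packing convention
# (even column in the low nibble, odd column in the high nibble), a vectorized
# PyTorch sketch of the same unpacking looks like the helper below; the helper
# name is ours, purely for illustration.
import torch

def unpack_uint4_reference(qB: torch.Tensor) -> torch.Tensor:
    low = (qB & 0xF).to(torch.half)          # columns with j % 2 == 0 (low nibble)
    high = ((qB >> 4) & 0xF).to(torch.half)  # columns with j % 2 == 1 (high nibble)
    # interleave as [low0, high0, low1, high1, ...] to reproduce the j ordering above
    return torch.stack((low, high), dim=-1).reshape(qB.shape[0], -1)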
@@ -236,14 +235,11 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads, - prelude=decode_i4_to_f16) as (bx, by): - + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads, prelude=decode_i4_to_f16) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, storage_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -255,40 +251,36 @@ def main( thread_binding = T.get_thread_binding(0) rk = T.get_thread_binding(1) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + } + ) T.use_swizzle(panel_size=10) T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, (block_K // reduce_k)): vk = rk * (block_K // reduce_k) + k A_shared[i, vk] = A[by * block_M + i, ko * block_K + vk] # TODO(lei): Layout Inference Pass is not efficient to handle the four dims int8 load - for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte // - (threads * vec_load_qb)): + for i in T.serial(block_N * (block_K // reduce_k) // num_elems_per_byte // (threads * vec_load_qb)): for v in T.vectorized(0, vec_load_qb): t = thread_binding idx = i * threads * vec_load_qb * reduce_k + rk * threads * vec_load_qb + t * vec_load_qb + v vkk = idx % (micro_size_k // num_elems_per_byte) vjj = (idx // (micro_size_k // num_elems_per_byte)) % micro_size_y - vk = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y) % ( - block_K // micro_size_k) - vj = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y // - (block_K // micro_size_k)) % ( - block_N // micro_size_y) - B_shared[vj, vk, vjj, - vkk] = B[bx * (block_N // micro_size_y) + vj, - ko * (block_K // micro_size_k) + vk, vjj, vkk] + vk = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y) % (block_K // micro_size_k) + vj = (idx // (micro_size_k // num_elems_per_byte) // micro_size_y // (block_K // micro_size_k)) % ( + block_N // micro_size_y + ) + B_shared[vj, vk, vjj, vkk] = B[bx * (block_N // micro_size_y) + vj, ko * (block_K // micro_size_k) + vk, vjj, vkk] for ki in T.serial(0, (block_K // (micro_size_k * reduce_k))): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -307,9 +299,13 @@ def main( for j in T.serial(warp_cols): local_size_b = mma_emitter.local_size_b - T.call_extern('handle', 'decode_i4u_to_f16', - T.address_of(B_local[j * local_size_b // num_elems_per_byte]), - T.address_of(B_dequantize_local[j * local_size_b]), 8) + T.call_extern( + "handle", + "decode_i4u_to_f16", + T.address_of(B_local[j * local_size_b // num_elems_per_byte]), + T.address_of(B_dequantize_local[j * local_size_b]), + 8, + ) mma_emitter.mma(A_local, B_dequantize_local, C_local) @@ -328,7 +324,8 @@ def main( reduced_accum_res[0], rk, dtype="handle", - )) + ) + ) if rk == 0: C_local[n] = reduced_accum_res[0] @@ -340,9 +337,9 @@ def main( for i, j in T.Parallel(block_M, (block_N // reduce_k)): vj = rk * (block_N // reduce_k) + j - C[by * block_M + i, - bx * block_N + vj] = C_shared[i // micro_size_x, vj // micro_size_y, 
- i % micro_size_x, vj % micro_size_y] + C[by * block_M + i, bx * block_N + vj] = C_shared[ + i // micro_size_x, vj // micro_size_y, i % micro_size_x, vj % micro_size_y + ] return main @@ -357,8 +354,8 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct transform_b, ): import bitblas - matmul = tl_matmul_with_ladder_weight_only_transform_block_reduce_int4( - M, N, K, in_dtype, out_dtype, accum_dtype, transform_b) + + matmul = tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(M, N, K, in_dtype, out_dtype, accum_dtype, transform_b) kernel = tilelang.compile(matmul, out_idx=[2]) src_code = kernel.get_kernel_source() @@ -368,11 +365,10 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct assert src_code is not None num_bits = 4 num_elems_per_byte = 8 // num_bits - storage_dtype = "int8" + storage_dtype = T.int8 A = torch.rand(M, K, device="cuda", dtype=getattr(torch, in_dtype)) - qB = torch.randint( - 0, 127, (N, K // num_elems_per_byte), device="cuda", dtype=getattr(torch, storage_dtype)) + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), device="cuda", dtype=getattr(torch, storage_dtype)) C = torch.zeros(M, N, device="cuda", dtype=getattr(torch, accum_dtype)) ladder_permutate_config = bitblas.ops.LadderPermutateConfig( @@ -407,9 +403,7 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct # Ensure that the latency is not None assert latency is not None - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for i in range(B.shape[0]): for j in range(B.shape[1]): B[i][j] = ((qB[i][j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -423,14 +417,13 @@ def assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correct @tilelang.testing.requires_package("bitblas") def test_run_dequantize_gemm(): - run_gemm(256, 256, 256, "float16", "float16", "float16", 128, 128, 32, num_threads=128) - run_gemm(256, 256, 256, "int8", "int32", "int32", 128, 128, 32, num_threads=128) + run_gemm(256, 256, 256, T.float16, T.float16, T.float16, 128, 128, 32, num_threads=128) + run_gemm(256, 256, 256, T.int8, T.int32, T.int32, 128, 128, 32, num_threads=128) @tilelang.testing.requires_package("bitblas") def test_assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(): - assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correctness( - 256, 1024, 512, "float16", "float16", "float16", 3) + assert_tl_matmul_with_ladder_weight_only_transform_block_reduce_int4_correctness(256, 1024, 512, T.float16, T.float16, T.float16, 3) def main(): diff --git a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py index c5588d516..2bdcbb068 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py @@ -9,30 +9,29 @@ def _tir_u8_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str): assert nbit == 4 - assert dtype == "float16" - assert val.dtype == "uint8" + assert dtype == T.float16 + assert val.dtype == T.uint8 # e_f4 == 0 -> e_f16 = 0 # e_f4 != 0 -> e_f16 = e_f4 + ExponentialBias(f16, f4) = e_f4 + (2^4 - 2^1) = e_f4 + 14 # s1e2m1 - mask = tir.const((1 << nbit) - 1, "uint16") - f4 = (val >> (pos.astype("uint16") * tir.const(nbit, "uint16"))) & mask - s = f4 >> 
tir.const(3, "uint16") - e_f4 = (f4 & tir.const(6, "uint16")) >> tir.const(1, "uint16") - e_f16 = e_f4 + tir.const(14, "uint16") - m_f4 = f4 & tir.const(1, "uint16") + mask = tir.const((1 << nbit) - 1, T.uint16) + f4 = (val >> (pos.astype(T.uint16) * tir.const(nbit, T.uint16))) & mask + s = f4 >> tir.const(3, T.uint16) + e_f4 = (f4 & tir.const(6, T.uint16)) >> tir.const(1, T.uint16) + e_f16 = e_f4 + tir.const(14, T.uint16) + m_f4 = f4 & tir.const(1, T.uint16) m_f16 = m_f4 - val_f16 = tir.reinterpret("float16", - ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16") - | m_f16 << tir.const(9, "uint16")).astype("uint16")) - # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, "float16"), val_f16) + val_f16 = tir.reinterpret( + T.float16, ((e_f16 | (s << tir.const(5, T.uint16))) << tir.const(10, T.uint16) | m_f16 << tir.const(9, T.uint16)).astype(T.uint16) + ) + # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, T.float16), val_f16) return val_f16 def torch_convert(tensor): - def print_bit(name, val): val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) def _convert(val, pos): @@ -61,15 +60,15 @@ def _convert(val, pos): @tilelang.jit(out_idx=[1]) def test_convert(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 B_shape = (N, K // num_elems_per_byte) B_shared_shape = (block_N, block_K // num_elems_per_byte) B_dequantize_shared_shape = (block_N, block_K) @T.prim_func def main( - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((N, K), in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), ): with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): B_shared = T.alloc_shared(B_shared_shape, storage_dtype) @@ -99,7 +98,7 @@ def test_fp4_fp16_convert_close(): K, block_N, block_K, - "float16", + T.float16, ) B = torch.randint(0, 16, (N, K // 2), dtype=torch.uint8, device="cuda").to(torch.uint8) @@ -118,23 +117,15 @@ def get_configs(): splits = [1] _configs = list(itertools.product(block_M, block_N, block_K, num_stages, threads, splits)) - configs = [{ - 'block_M': c[0], - 'block_N': c[1], - 'block_K': c[2], - 'num_stages': c[3], - 'threads': c[4], - 'split': c[5] - } for c in _configs] + configs = [{"block_M": c[0], "block_N": c[1], "block_K": c[2], "num_stages": c[3], "threads": c[4], "split": c[5]} for c in _configs] return configs def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): - @tilelang.jit(out_idx=[2]) def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 A_shape = (M, K) B_shape = (N, K // num_elems_per_byte) A_shared_shape = (block_M, block_K) @@ -145,29 +136,24 @@ def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1): @T.prim_func def main_split( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - SplitC = T.alloc_buffer([ - split, (N + block_N - 1) // block_N * block_N, - (M + block_M - 1) // block_M * block_M - ], out_dtype) - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, - threads=threads) as (bx, by, bz): + SplitC = T.alloc_buffer([split, (N + block_N - 1) // block_N * block_N, (M + block_M - 1) // block_M 
* block_M], out_dtype) + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, threads=threads) as (bx, by, bz): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) B_dequantize_prev_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype) Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) - Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // (block_K * split), num_stages=num_stages): @@ -183,8 +169,7 @@ def main_split( ) T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) - T.copy(Ct_local, SplitC[bz, bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_local, SplitC[bz, bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M)) as (bx, by): acc = T.alloc_fragment((block_N, block_M), out_dtype) T.clear(acc) @@ -195,12 +180,11 @@ def main_split( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -209,10 +193,11 @@ def main( Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -229,8 +214,7 @@ def main( T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) T.copy(Ct_local, Ct_shared) - T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) if split == 1: return main @@ -241,12 +225,7 @@ def main( @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[2]) - def kernel(block_M=None, - block_N=None, - block_K=None, - num_stages=None, - threads=None, - split=None): + def kernel(block_M=None, block_N=None, block_K=None, num_stages=None, threads=None, split=None): return kernel_func(block_M, block_N, block_K, num_stages, threads, split).prim_func return kernel() @@ -259,7 +238,7 @@ def kernel(block_M, block_N, block_K, num_stages, threads, split=1): def ref_program(A, qB): - dtypeC = "float16" + dtypeC = T.float16 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = 
C.to(torch.__getattribute__(dtypeC)) @@ -269,10 +248,10 @@ def ref_program(A, qB): def main(m=256, n=256, k=256, tune=False): total_flops = 2 * m * n * k - if (not tune): - kernel = matmul( - m, n, k, "float16", "float16", "float32", num_bits=4, tune=tune)( - block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1) + if not tune: + kernel = matmul(m, n, k, T.float16, T.float16, T.float32, num_bits=4, tune=tune)( + block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1 + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Integer) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") @@ -283,7 +262,7 @@ def main(m=256, n=256, k=256, tune=False): print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) else: - best_result = matmul(m, n, k, "float16", "float16", "float32", num_bits=4, tune=tune) + best_result = matmul(m, n, k, T.float16, T.float16, T.float32, num_bits=4, tune=tune) best_latency = best_result.latency best_config = best_result.config print(f"Best latency: {best_latency}") @@ -291,12 +270,20 @@ def main(m=256, n=256, k=256, tune=False): print(f"Best config: {best_config}") +def run_regression_perf(m=4096, n=4096, k=4096): + kernel = matmul(m, n, k, "float16", "float16", "float32", num_bits=4, tune=False)( + block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1 + ) + profiler = kernel.get_profiler(tilelang.TensorSupplyType.Integer) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--m', type=int, default=256, help='M') - parser.add_argument('--n', type=int, default=256, help='N') - parser.add_argument('--k', type=int, default=256, help='K') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--m", type=int, default=256, help="M") + parser.add_argument("--n", type=int, default=256, help="N") + parser.add_argument("--k", type=int, default=256, help="K") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() M, N, K = args.m, args.n, args.k main(M, N, K, args.tune) diff --git a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py index 52ee8216f..b1f8b1132 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py +++ b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py @@ -9,15 +9,15 @@ def _tir_u8_to_i4_to_i8(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str): assert nbit == 4 - assert dtype == "int8" - assert val.dtype == "uint8" + assert dtype == T.int8 + assert val.dtype == T.uint8 - mask = tir.const((1 << nbit) - 1, "uint8") + mask = tir.const((1 << nbit) - 1, T.uint8) - i4 = (val >> (pos.astype("uint8") * tir.const(nbit, "uint8"))) & mask + i4 = (val >> (pos.astype(T.uint8) * tir.const(nbit, T.uint8))) & mask - i8_shifted = tir.reinterpret("int8", i4 << tir.const(4, "uint8")) - i8 = i8_shifted >> tir.const(4, "int8") + i8_shifted = tir.reinterpret(T.int8, i4 << tir.const(4, T.uint8)) + i8 = i8_shifted >> tir.const(4, T.int8) return i8 @@ -35,15 +35,15 @@ def get_configs(): @tilelang.jit(out_idx=[1]) def _convert_test(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 B_shape = (N, K // num_elems_per_byte) B_shared_shape = (block_N, block_K // num_elems_per_byte) 
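# Worked example (illustrative comment, not part of the patch) of the nibble
# sign-extension used by `_tir_u8_to_i4_to_i8` above and by `torch_convert`
# below: the packed nibble 0b1111 must decode to the int4 value -1. Shifting it
# into the high half of an int8 and arithmetic-shifting back restores the sign:
#   >>> x = torch.tensor([0b1111], dtype=torch.int8)
#   >>> ((x << 4) >> 4).item()
#   -1
# (0b1111 << 4 wraps to -16 in int8, and -16 >> 4 == -1.)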
B_dequantize_shared_shape = (block_N, block_K) @T.prim_func def main( - B: T.Tensor(B_shape, storage_dtype), - C: T.Tensor((N, K), in_dtype), + B: T.Tensor(B_shape, storage_dtype), + C: T.Tensor((N, K), in_dtype), ): with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx): B_shared = T.alloc_shared(B_shared_shape, storage_dtype) @@ -66,13 +66,12 @@ def main( def torch_convert(tensor): - def _convert(val, pos): assert val.dtype == torch.uint8 val = val.view(torch.int8) mask = (1 << 4) - 1 - i4_shifted = ((val >> (pos * 4)) & mask) - i4 = ((i4_shifted << 4) >> 4) + i4_shifted = (val >> (pos * 4)) & mask + i4 = (i4_shifted << 4) >> 4 return i4.view(torch.int8) @@ -86,7 +85,7 @@ def _convert(val, pos): def ref_program(A, qB): - dtypeC = "int32" + dtypeC = T.int32 B = torch_convert(qB) C = torch.matmul(A.to(torch.float), B.T.to(torch.float)) C = C.to(torch.__getattribute__(dtypeC)) @@ -94,11 +93,10 @@ def ref_program(A, qB): def matmul_int8xint4(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False): - @tilelang.jit(out_idx=[2]) def kernel_func(block_M, block_N, block_K, num_stages, threads): num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 A_shape = (M, K) B_shape = (N, K // num_elems_per_byte) A_shared_shape = (block_M, block_K) @@ -109,12 +107,11 @@ def kernel_func(block_M, block_N, block_K, num_stages, threads): @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, storage_dtype), - Ct: T.Tensor((N, M), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, storage_dtype), + Ct: T.Tensor((N, M), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_local = T.alloc_fragment(B_shared_shape, storage_dtype) @@ -123,10 +120,11 @@ def main( Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype) Ct_shared = T.alloc_shared((block_N, block_M), out_dtype) - T.annotate_layout({ - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.clear(Ct_local) for k in T.Pipelined(K // block_K, num_stages=num_stages): @@ -143,8 +141,7 @@ def main( T.copy(B_dequantize_local, B_dequantize_prev_local) T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True) T.copy(Ct_local, Ct_shared) - T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N, - by * block_M:(by + 1) * block_M]) + T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M]) return main @@ -167,10 +164,10 @@ def kernel(block_M, block_N, block_K, num_stages, threads): def main(m=128, n=256, k=256, tune=False): total_flops = 2 * m * n * k - if (not tune): - kernel = matmul_int8xint4( - m, n, k, "int8", "int32", "int32", num_bits=4, tune=tune)( - block_M=32, block_N=32, block_K=128, num_stages=1, threads=128) + if not tune: + kernel = matmul_int8xint4(m, n, k, T.int8, T.int32, T.int32, num_bits=4, tune=tune)( + block_M=32, block_N=32, block_K=128, num_stages=1, threads=128 + ) profiler = kernel.get_profiler() profiler.assert_allclose(ref_program, rtol=1e-2, atol=1e-2) print("All checks pass.") @@ -179,7 +176,7 @@ def main(m=128, n=256, k=256, tune=False): print(f"Tilelang: {latency} ms") else: - 
best_result = matmul_int8xint4(m, n, k, "int8", "int32", "int32", num_bits=4, tune=tune) + best_result = matmul_int8xint4(m, n, k, T.int8, T.int32, T.int32, num_bits=4, tune=tune) best_latency = best_result.latency best_config = best_result.config print(f"Bset latency: {best_latency}") @@ -187,6 +184,14 @@ def main(m=128, n=256, k=256, tune=False): print(f"Best tflops: {total_flops / best_latency * 1e-9}") +def run_regression_perf(m=4096, n=4096, k=4096): + kernel = matmul_int8xint4(m, n, k, "int8", "int32", "int32", num_bits=4, tune=False)( + block_M=32, block_N=32, block_K=128, num_stages=1, threads=128 + ) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--m", type=int, default=512, help="Matrix dimension M") diff --git a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py index d3e90ec93..43e97f930 100644 --- a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py +++ b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py @@ -4,7 +4,8 @@ import torch from tilelang import DataType from tilelang.quantize import ( - _tir_packed_int_to_int_convert,) + _tir_packed_int_to_int_convert, +) @tilelang.jit @@ -16,7 +17,7 @@ def dequantize_gemv( out_dtype: str, accum_dtype: str, num_bits: int = 4, - storage_dtype: str = "int8", + storage_dtype: T.dtype = T.int8, source_format: str = "uint", n_partition: int = 4, reduce_thread: int = 32, @@ -26,11 +27,10 @@ def dequantize_gemv( group_size: int = -1, with_scaling: bool = False, ) -> Callable[..., Any]: - assert n_partition is not None, "n_partition must be provided" assert reduce_thread is not None, ( - "reduce_thread must be provided currently, as related bitblas.gpu.gemv.GEMV" - "sch_outer_reduction_with_config is not implemented") + "reduce_thread must be provided currently, as related bitblas.gpu.gemv.GEMVsch_outer_reduction_with_config is not implemented" + ) assert trans_A is False, "Dequantize only implement for trans_A=False currently" assert trans_B is True, "Dequantize only implement for trans_B=TRue currently" @@ -51,7 +51,7 @@ def dequantize_gemv( C_shape = (M, N) dp4a_size = 4 - use_dp4a = in_dtype == "int8" and accum_dtype == "int32" + use_dp4a = in_dtype == T.int8 and accum_dtype == T.int32 import_source: Optional[str] = None func_name: str = "" @@ -81,12 +81,12 @@ def main( C: T.Tensor[C_shape, out_dtype], ): with T.Kernel( - T.ceildiv(N, n_partition), - M, - threads=(reduce_thread, n_partition), + T.ceildiv(N, n_partition), + M, + threads=(reduce_thread, n_partition), ) as ( - bx, - by, + bx, + by, ): A_local = T.alloc_local((micro_size_k,), in_dtype) B_quant_local = T.alloc_local([micro_size_k_compressed], storage_dtype) @@ -107,8 +107,7 @@ def main( for v in T.vectorized(micro_size_k_compressed): B_quant_local[v] = B[ bx * n_partition + ni, - ko * (reduce_thread * micro_size_k_compressed) + - kr * micro_size_k_compressed + v, + ko * (reduce_thread * micro_size_k_compressed) + kr * micro_size_k_compressed + v, ] if fast_decoding: @@ -120,10 +119,9 @@ def main( ) else: for ki in T.serial(micro_size_k): - B_dequantize_local[ki] = _tir_packed_int_to_int_convert( - storage_type, - storage_nbit)(num_bits, B_quant_local[ki // num_elems_per_byte], - ki % num_elems_per_byte, in_dtype) + B_dequantize_local[ki] = _tir_packed_int_to_int_convert(storage_type, storage_nbit)( + num_bits, B_quant_local[ki // num_elems_per_byte], ki % num_elems_per_byte, 
in_dtype + ) if use_dp4a: for ki in T.serial(micro_size_k // dp4a_size): @@ -137,9 +135,9 @@ def main( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -149,7 +147,8 @@ def main( reduced_accum_res[0], kr, dtype="handle", - )) + ) + ) if kr == 0: C[by, bx * n_partition + ni] = reduced_accum_res[0] @@ -160,11 +159,11 @@ def main() -> None: M = 1 N = 1024 K = 1024 - in_dtype = "float16" - out_dtype = "float16" - accum_dtype = "float16" + in_dtype = T.float16 + out_dtype = T.float16 + accum_dtype = T.float16 num_bits = 4 - storage_dtype = "int8" + storage_dtype = T.int8 source_format = "uint" n_partition = 4 reduce_thread = 32 @@ -174,26 +173,39 @@ def main() -> None: group_size = -1 with_scaling = False - kernel = dequantize_gemv(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits, storage_dtype, - source_format, n_partition, reduce_thread, fast_decoding, trans_A, - trans_B, group_size, with_scaling) + kernel = dequantize_gemv( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + num_bits, + storage_dtype, + source_format, + n_partition, + reduce_thread, + fast_decoding, + trans_A, + trans_B, + group_size, + with_scaling, + ) storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) num_elems_per_byte = storage_nbit // num_bits A = torch.rand(M, K, dtype=getattr(torch, in_dtype)).cuda() - qB = torch.randint( - 0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() C = torch.zeros(M, N, dtype=getattr(torch, accum_dtype)).cuda() if fast_decoding: from tilelang.quantize.utils import interleave_weight + qB = interleave_weight(qB, num_bits, in_dtype) kernel(A, qB, C) # int4 reference - B = ( - torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, - dtype=torch.half).to(torch.half).to(A.device)) + B = torch.zeros(qB.shape[0], qB.shape[1] * 8 // 4, dtype=torch.half).to(torch.half).to(A.device) for j in range(B.shape[1]): B[:, j] = ((qB[:, j // 2] >> (4 * (j % 2))) & 0xF).to(torch.half) @@ -205,5 +217,62 @@ def main() -> None: torch.testing.assert_close(C, ref_c, atol=1e3, rtol=1e-1) +def run_regression_perf(): + M = 1 + N = 8192 + K = 8192 + in_dtype = "float16" + out_dtype = "float16" + accum_dtype = "float16" + num_bits = 4 + storage_dtype = "int8" + source_format = "uint" + n_partition = 4 + reduce_thread = 32 + fast_decoding = True + trans_A = False + trans_B = True + group_size = -1 + with_scaling = False + + kernel = dequantize_gemv( + M, + N, + K, + in_dtype, + out_dtype, + accum_dtype, + num_bits, + storage_dtype, + source_format, + n_partition, + reduce_thread, + fast_decoding, + trans_A, + trans_B, + group_size, + with_scaling, + ) + + storage_nbit = int("".join(c for c in storage_dtype if c.isdigit())) + num_elems_per_byte = storage_nbit // num_bits + A = torch.rand(M, K, dtype=getattr(torch, in_dtype)).cuda() + qB = torch.randint(0, 127, (N, K // num_elems_per_byte), dtype=getattr(torch, storage_dtype)).cuda() + C = torch.zeros(M, N, dtype=getattr(torch, accum_dtype)).cuda() + + if fast_decoding: + from tilelang.quantize.utils import interleave_weight + + qB = interleave_weight(qB, num_bits, in_dtype) + kernel(A, qB, C) + + from 
tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(A, qB, C) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py index c4cf5fb50..6ee595921 100644 --- a/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_groupedgemm_bf16_mxfp4_hopper.py @@ -25,6 +25,7 @@ def get_configs(): List[dict]: A list of configuration dictionaries covering all combinations. """ import itertools + iter_params = dict( block_M=[128], block_N=[64, 128, 256], @@ -33,33 +34,33 @@ def get_configs(): threads=[128, 256, 512], split=[1], ) - return [{ - k: v for k, v in zip(iter_params, values) - } for values in itertools.product(*iter_params.values())] + return [{k: v for k, v in zip(iter_params, values)} for values in itertools.product(*iter_params.values())] @tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[-1]) -def matmul(M, - N, - K, - topk, - E, - padding_M, - in_dtype, - out_dtype, - accum_dtype, - source_format='uint', - num_bits=4, - scale_size=32, - fast_dequant=True, - with_bias=False, - block_M=128, - block_N=256, - block_K=128, - num_stages=2, - threads=256, - split=1): +def matmul( + M, + N, + K, + topk, + E, + padding_M, + in_dtype, + out_dtype, + accum_dtype, + source_format=T.uint32, + num_bits=4, + scale_size=32, + fast_dequant=True, + with_bias=False, + block_M=128, + block_N=256, + block_K=128, + num_stages=2, + threads=256, + split=1, +): """ Construct and return a grouped (Mixture-of-Experts) matrix-multiply TIR kernel that multiplies A (shape MxK) by a quantized, expert-grouped B (shape ExNxQK) and writes an output of shape (M, topk, N) in out_dtype. @@ -82,8 +83,8 @@ def matmul(M, topk (int): number of experts selected per token. E (int): number of experts. padding_M (int): padded number of tokens after grouping and block alignment. - in_dtype (str): element type of A (e.g., "bfloat16"). - out_dtype (str): output tensor element type (e.g., "bfloat16"). + in_dtype (str): element type of A (e.g., T.bfloat16). + out_dtype (str): output tensor element type (e.g., T.bfloat16). accum_dtype (str): accumulation type used for the inner GEMM. source_format (str, optional): format string passed to intrinsic selector (default "uint"). num_bits (int, optional): number of bits per quantized element in B (default 4). @@ -110,16 +111,17 @@ def matmul(M, """ num_elems_per_byte = 8 // num_bits - storage_dtype = "uint8" + storage_dtype = T.uint8 QK = K // num_elems_per_byte Block_QK = block_K // num_elems_per_byte A_shared_shape = (block_M, block_K) B_shared_shape = (block_N, Block_QK) - Bias_shared_shape = (block_N) + Bias_shared_shape = block_N B_dequantize_shared_shape = (block_N, block_K) assert K % (block_K * split) == 0 from tilelang.quantize import get_mxfp_intrin_group + # fast_dequant_bf16_fp4_twiddling mxfp_intrin_info = get_mxfp_intrin_group( out_dtype=in_dtype, @@ -135,7 +137,7 @@ def matmul(M, import_source = import_source # the dequant part is the same as in dequant_gemm - def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): + def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype=T.bfloat16): """ Return a TileLang macro that performs fast dequantization of twiddled FP4-packed data into BF16. 
The returned macro has signature (B_shared, B_dequantize_shared, Scale, k) and: @@ -145,12 +147,12 @@ def get_fast_dequant_twiddling_func(in_dtype="fp4", out_dtype="bfloat16"): - Writes the scaled BF16 results into B_dequantize_shared. Notes: - - This factory only supports in_dtype="fp4" and out_dtype="bfloat16". + - This factory only supports in_dtype="fp4" and out_dtype=T.bfloat16. - The macro depends on several names from the enclosing scope (e.g., import_source, func_name, DataType, num_elems_per_byte, storage_dtype, block_N, block_K, threads, scale_size); those must be defined and consistent with the kernel that will use the macro. - The macro issues a T.import_source and T.call_extern to invoke the external intrinsic; ensure the external implementation matching `func_name` is available at compilation/runtime. """ assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] # Some variables for dequantization in each thread MAX_TRANSACTION_SIZE_BITS = 128 @@ -221,19 +223,16 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale_shared, for v in T.vectorized(0, local_size): index = i * threads * local_size + tx * local_size + v - B_dequantize_shared[index // block_K, - index % block_K] = B_dequantize_local_thread[v] + B_dequantize_shared[index // block_K, index % block_K] = B_dequantize_local_thread[v] return fast_dequant_bf16_fp4_twiddling - def get_simple_dequant_func(in_dtype="fp4", out_dtype="bfloat16"): - + def get_simple_dequant_func(in_dtype="fp4", out_dtype=T.bfloat16): assert in_dtype in ["fp4"] - assert out_dtype in ["bfloat16"] + assert out_dtype in [T.bfloat16] @T.macro def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): - B_local = T.alloc_fragment(B_shared_shape, storage_dtype) B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, out_dtype) @@ -244,8 +243,8 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): B_local[i, j // num_elems_per_byte], j % num_elems_per_byte, Scale_shared[ - i, k * block_K // scale_size + j // - scale_size], # Scale is the exponential part, within the representation of uint8 + i, k * block_K // scale_size + j // scale_size + ], # Scale is the exponential part, within the representation of uint8 dtype=out_dtype, ) * T.shift_left(1, (Scale_shared[i, k * block_K // scale_size + j // scale_size])) T.copy(B_dequantize_local, B_dequantize_shared) @@ -254,19 +253,17 @@ def simple_dequant_bf16_fp4(B_shared, B_dequantize_shared, Scale_shared, k): @T.prim_func def main( - A: T.Tensor((M, K), in_dtype), - B: T.Tensor((E, N, QK), storage_dtype), - Scale: T.Tensor((E, N, K // scale_size), storage_dtype), - Bias: T.Tensor((E, N), out_dtype), - # Add fusedmoe tensors - topk_weights: T.Tensor((M * topk), out_dtype), - sorted_token_ids: T.Tensor((padding_M), "int32"), - expert_ids: T.Tensor((padding_M // block_M), "int32"), - C: T.Tensor((M, topk, N), out_dtype), + A: T.Tensor((M, K), in_dtype), + B: T.Tensor((E, N, QK), storage_dtype), + Scale: T.Tensor((E, N, K // scale_size), storage_dtype), + Bias: T.Tensor((E, N), out_dtype), + # Add fusedmoe tensors + topk_weights: T.Tensor((M * topk), out_dtype), + sorted_token_ids: T.Tensor((padding_M), T.int32), + expert_ids: T.Tensor((padding_M // block_M), T.int32), + C: T.Tensor((M, topk, N), out_dtype), ): - - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(padding_M, block_M), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(padding_M, block_M), threads=threads) as 
(bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, storage_dtype) B_dequantize_shared = T.alloc_shared(B_dequantize_shared_shape, in_dtype) @@ -274,23 +271,23 @@ def main( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) topk_weights_shared = T.alloc_shared((block_M), out_dtype) - sorted_token_ids_shared = T.alloc_shared((block_M), "int32") - expert_id = T.alloc_local((1), "int32") # the expert id for the current block + sorted_token_ids_shared = T.alloc_shared((block_M), T.int32) + expert_id = T.alloc_local((1), T.int32) # the expert id for the current block # To use 1D TMA, the last dim of Scale_shared must have stride=1 # May use much more shared memory than necessary Scale_shared = T.alloc_shared((block_N, K // scale_size), storage_dtype) - T.annotate_layout({ - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - B_shared: tilelang.layout.make_swizzled_layout(B_shared), - C_shared: tilelang.layout.make_swizzled_layout(C_shared), - }) + T.annotate_layout( + { + B_shared: tilelang.layout.make_swizzled_layout(B_shared), + } + ) T.use_swizzle(10) if threads == 512: T.disable_warp_group_reg_alloc() - T.copy(sorted_token_ids[by * block_M:(by + 1) * block_M], sorted_token_ids_shared) + T.copy(sorted_token_ids[by * block_M : (by + 1) * block_M], sorted_token_ids_shared) expert_id[0] = expert_ids[by] # Get the topk weights of each token in the current block @@ -300,11 +297,11 @@ def main( # Get bias and scale based on the expert id if with_bias: - T.copy(Bias[expert_id[0], bx * block_N:(bx + 1) * block_N], Bias_shared) + T.copy(Bias[expert_id[0], bx * block_N : (bx + 1) * block_N], Bias_shared) else: T.clear(Bias_shared) - T.copy(Scale[expert_id[0], bx * block_N:(bx + 1) * block_N, :], Scale_shared) + T.copy(Scale[expert_id[0], bx * block_N : (bx + 1) * block_N, :], Scale_shared) for i, j in T.Parallel(block_M, block_N): C_local[i, j] = Bias_shared[j] @@ -317,14 +314,13 @@ def main( base = copy_i * threads * 16 + tx * 16 if sorted_token_ids_shared[base // block_K] != -1: for copy_j in T.vectorized(16): - A_shared[base // block_K, base % block_K + - copy_j] = A[sorted_token_ids_shared[base // block_K] // topk, - k * block_K + base % block_K + copy_j] + A_shared[base // block_K, base % block_K + copy_j] = A[ + sorted_token_ids_shared[base // block_K] // topk, k * block_K + base % block_K + copy_j + ] T.copy(B[expert_id[0], bx * block_N, k * block_K // num_elems_per_byte], B_shared) if fast_dequant: - get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, - k) + get_fast_dequant_twiddling_func()(B_shared, B_dequantize_shared, Scale_shared, k) else: get_simple_dequant_func()(B_shared, B_dequantize_shared, Scale_shared, k) @@ -338,16 +334,17 @@ def main( base = copy_i * threads * 16 + tx * 16 if sorted_token_ids_shared[base // block_N] != -1: for copy_j in T.vectorized(16): - C[sorted_token_ids_shared[base // block_N] // topk, - sorted_token_ids_shared[base // block_N] % topk, bx * block_N + - base % block_N + copy_j] = C_shared[base // block_N, - base % block_N + copy_j] + C[ + sorted_token_ids_shared[base // block_N] // topk, + sorted_token_ids_shared[base // block_N] % topk, + bx * block_N + base % block_N + copy_j, + ] = C_shared[base // block_N, base % block_N + copy_j] return main def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, block_M=256): - dtypeC = "bfloat16" + dtypeC = T.bfloat16 M, K = A.shape E, N, QK = 
qB.shape topk = topk_weights.shape[0] // M @@ -355,7 +352,7 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc assert scale_size == 32 # MXFP4 # Initialize output tensor - C = torch.ones((M, topk, N), dtype=getattr(torch, dtypeC), device='cuda') + C = torch.ones((M, topk, N), dtype=getattr(torch, dtypeC), device="cuda") # Iterate over sorted_token_ids for idx in range(len(sorted_token_ids)): # padding_M @@ -370,14 +367,11 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc # Dequantize the expert weights B = torch_convert_bit_twiddling(qB[expert_id]) # shape: (N, K) - B *= 2**( - Scale[expert_id][:, (torch.arange(B.shape[1], device=B.device) // scale_size)].to( - torch.bfloat16)) + B *= 2 ** (Scale[expert_id][:, (torch.arange(B.shape[1], device=B.device) // scale_size)].to(torch.bfloat16)) # Compute the output for this token-expert pair # token_embedding @ B.T + bias - output = torch.matmul(token_embedding.to(torch.bfloat16), B.T.to( - torch.bfloat16)) + Bias[expert_id] + output = torch.matmul(token_embedding.to(torch.bfloat16), B.T.to(torch.bfloat16)) + Bias[expert_id] output = output.to(torch.__getattribute__(dtypeC)) # Apply the topk weight @@ -391,14 +385,12 @@ def ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, bloc def get_data(m, n, k, qk, scale_size, topk, E, block_M): - A = torch.empty(m, k, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) - qB = torch.randint( - 0, 256, (E, n, qk), dtype=torch.uint8, - device='cuda') # Quantized weight tensor for E experts. - Scale = torch.randint(0, 8, (E, n, k // scale_size), dtype=torch.uint8, device='cuda') - Bias = torch.empty(E, n, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) - - weights = torch.empty(m, E, dtype=torch.bfloat16, device='cuda').uniform_(-1, 1) + A = torch.empty(m, k, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) + qB = torch.randint(0, 256, (E, n, qk), dtype=torch.uint8, device="cuda") # Quantized weight tensor for E experts. + Scale = torch.randint(0, 8, (E, n, k // scale_size), dtype=torch.uint8, device="cuda") + Bias = torch.empty(E, n, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) + + weights = torch.empty(m, E, dtype=torch.bfloat16, device="cuda").uniform_(-1, 1) # topk_weights: Router weights for the top-k experts for each token. # Shape: (m, topk) # tokens_experts: A flattened tensor of expert assignments for each token. @@ -420,10 +412,7 @@ def get_data(m, n, k, qk, scale_size, topk, E, block_M): pad_len = ((cnt + block_M - 1) // block_M) * block_M - cnt if pad_len > 0: # -1 for padding (`M` instead in vLLM moe_align_block_size()) - group_token_ids = torch.cat([ - group_token_ids, - torch.full((pad_len,), -1, dtype=group_token_ids.dtype, device='cuda') - ]) + group_token_ids = torch.cat([group_token_ids, torch.full((pad_len,), -1, dtype=group_token_ids.dtype, device="cuda")]) padded_token_ids.append(group_token_ids) expert_ids.extend([eid] * ((cnt + block_M - 1) // block_M)) start = end @@ -431,21 +420,13 @@ def get_data(m, n, k, qk, scale_size, topk, E, block_M): # sorted_token_ids: The final flattened and padded tensor of token indices. sorted_token_ids = torch.cat(padded_token_ids, dim=0).to(torch.int32) # (padding_M,) # expert_ids: The final tensor of expert IDs corresponding to `sorted_token_ids`. 
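# Illustrative trace of the grouping above (assumed toy example, not part of the
# patch): with m=2 tokens, topk=2, E=2, block_M=4, suppose both tokens route to
# experts (0, 1) in that slot order. Flattened slot ids are token * topk + slot,
# so expert 0 owns slots {0, 2} and expert 1 owns slots {1, 3}. Padding each
# expert's group to a multiple of block_M then yields
#   sorted_token_ids = [0, 2, -1, -1, 1, 3, -1, -1]   # padding_M = 8
#   expert_ids       = [0, 1]                         # one id per block_M-sized block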
- expert_ids = torch.tensor(expert_ids, dtype=torch.int32, device='cuda') # (padding_M,) + expert_ids = torch.tensor(expert_ids, dtype=torch.int32, device="cuda") # (padding_M,) padding_M = sorted_token_ids.shape[0] # padding_M: token number after padding return A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M -def main(m=256, - n=256, - k=256, - scale_size=32, - topk=4, - E=32, - fast_dequant=True, - with_bias=False, - tune=False): +def main(m=256, n=256, k=256, scale_size=32, topk=4, E=32, fast_dequant=True, with_bias=False, tune=False): # Tunable parameters block_M, block_N, block_K = 128, 256, 128 # noqa: F841 num_stages = 1 # noqa: F841 @@ -456,8 +437,7 @@ def main(m=256, num_bits = 4 num_elems_per_byte = 8 // num_bits qk = k // num_elems_per_byte - A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data( - m, n, k, qk, scale_size, topk, E, block_M) + A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data(m, n, k, qk, scale_size, topk, E, block_M) if tune: with set_autotune_inputs([A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids]): @@ -469,9 +449,9 @@ def main(m=256, topk, E, padding_M, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=num_bits, scale_size=scale_size, fast_dequant=fast_dequant, @@ -485,9 +465,9 @@ def main(m=256, topk, E, padding_M, - "bfloat16", - "bfloat16", - "float32", + T.bfloat16, + T.bfloat16, + T.float32, num_bits=num_bits, scale_size=scale_size, fast_dequant=fast_dequant, @@ -510,14 +490,11 @@ def main(m=256, expert_ids, ) - print('Tilelang kernel run finished.') + print("Tilelang kernel run finished.") - ref_output = ref_moe( - A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, - block_M=block_M) # Maybe a little bit slow... + ref_output = ref_moe(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, block_M=block_M) # Maybe a little bit slow... - latency = tilelang.profiler.do_bench( - lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), warmup=100) + latency = tilelang.profiler.do_bench(lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), warmup=100) print("Tilelang: {:.2f} ms".format(latency)) print("Tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) @@ -525,32 +502,72 @@ def main(m=256, max_val = diff.max() max_idx = diff.argmax() print(f"max abs diff: {max_val} at index: {max_idx}") - assert_similar( - output, ref_output, name="output", - eps=2e-5) # We care about the similarity rather than abs. difference + assert_similar(output, ref_output, name="output", eps=2e-5) # We care about the similarity rather than abs. difference print("All checks pass. 
✅") +def run_regression_perf(m=4096, n=4096, k=4096, scale_size=32, topk=4, E=32, fast_dequant=True, with_bias=False, tune=False): + block_M, block_N, block_K = 128, 256, 128 + num_stages = 1 + threads = 512 + split = 1 + num_bits = 4 + num_elems_per_byte = 8 // num_bits + qk = k // num_elems_per_byte + A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids, padding_M = get_data(m, n, k, qk, scale_size, topk, E, block_M) + + if tune: + with set_autotune_inputs([A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids]): + kernel = matmul( + m, + n, + k, + topk, + E, + padding_M, + "bfloat16", + "bfloat16", + "float32", + num_bits=num_bits, + scale_size=scale_size, + fast_dequant=fast_dequant, + with_bias=with_bias, + ) + else: + kernel = matmul( + m, + n, + k, + topk, + E, + padding_M, + "bfloat16", + "bfloat16", + "float32", + num_bits=num_bits, + scale_size=scale_size, + fast_dequant=fast_dequant, + with_bias=with_bias, + block_M=block_M, + block_N=block_N, + block_K=block_K, + num_stages=num_stages, + threads=threads, + split=split, + ) + + return tilelang.profiler.do_bench(lambda: kernel(A, qB, Scale, Bias, topk_weights, sorted_token_ids, expert_ids), backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--M", type=int, default=16384, help="M") # From gpt-oss-20b MoE's first gemm + parser.add_argument("--M", type=int, default=16384, help="M") # From gpt-oss-20b MoE's first gemm parser.add_argument("--N", type=int, default=5760, help="N") parser.add_argument("--K", type=int, default=2944, help="K") parser.add_argument("--scale_size", type=int, default=32, help="scale size") - parser.add_argument( - "--topk", type=int, default=4, help="topk") # experts activated for each token + parser.add_argument("--topk", type=int, default=4, help="topk") # experts activated for each token parser.add_argument("--E", type=int, default=32, help="E") # number of experts parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - main( - args.M, - args.N, - args.K, - args.scale_size, - topk=args.topk, - E=args.E, - fast_dequant=True, - with_bias=True, - tune=args.tune) + main(args.M, args.N, args.K, args.scale_size, topk=args.topk, E=args.E, fast_dequant=True, with_bias=True, tune=args.tune) diff --git a/examples/dequantize_gemm/regression_example_dequantize_gemm.py b/examples/dequantize_gemm/regression_example_dequantize_gemm.py new file mode 100644 index 000000000..4ab03784f --- /dev/null +++ b/examples/dequantize_gemm/regression_example_dequantize_gemm.py @@ -0,0 +1,35 @@ +import tilelang.testing +import example_dequant_gemm_bf16_fp4_hopper +import example_dequant_gemm_bf16_mxfp4_hopper +import example_dequant_gemm_fp4_hopper +import example_dequant_gemm_w4a8 +import example_dequant_gemv_fp16xint4 +import example_dequant_groupedgemm_bf16_mxfp4_hopper + + +def regression_example_dequant_gemv_fp16xint4(): + tilelang.testing.process_func(example_dequant_gemv_fp16xint4.run_regression_perf) + + +def regression_example_dequant_gemm_fp4_hopper(): + tilelang.testing.process_func(example_dequant_gemm_fp4_hopper.run_regression_perf) + + +def regression_example_dequant_gemm_bf16_fp4_hopper(): + tilelang.testing.process_func(example_dequant_gemm_bf16_fp4_hopper.run_regression_perf) + + +def regression_example_dequant_gemm_bf16_mxfp4_hopper(): + tilelang.testing.process_func(example_dequant_gemm_bf16_mxfp4_hopper.run_regression_perf) + + +def regression_example_dequant_groupedgemm_bf16_mxfp4_hopper(): + 
tilelang.testing.process_func(example_dequant_groupedgemm_bf16_mxfp4_hopper.run_regression_perf) + + +def regression_example_dequant_gemm_w4a8(): + tilelang.testing.process_func(example_dequant_gemm_w4a8.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/dequantize_gemm/test_example_dequantize_gemm.py b/examples/dequantize_gemm/test_example_dequantize_gemm.py index 01bc40e6c..a2f777222 100644 --- a/examples/dequantize_gemm/test_example_dequantize_gemm.py +++ b/examples/dequantize_gemm/test_example_dequantize_gemm.py @@ -3,7 +3,6 @@ import example_dequant_gemv_fp16xint4 import example_dequant_gemm_fp4_hopper import example_dequant_gemm_bf16_mxfp4_hopper -import example_dequant_gemm_bf16_mxfp4_hopper_tma import example_dequant_groupedgemm_bf16_mxfp4_hopper import example_dequant_gemm_w4a8 @@ -25,12 +24,6 @@ def test_example_dequant_gemm_bf16_mxfp4_hopper(): example_dequant_gemm_bf16_mxfp4_hopper.main() -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_dequant_gemm_bf16_mxfp4_hopper_tma(): - example_dequant_gemm_bf16_mxfp4_hopper_tma.main() - - @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_dequant_groupedgemm_bf16_mxfp4_hopper(): diff --git a/examples/dequantize_gemm/utils.py b/examples/dequantize_gemm/utils.py index 7134ae6aa..da9ddb9f8 100644 --- a/examples/dequantize_gemm/utils.py +++ b/examples/dequantize_gemm/utils.py @@ -34,8 +34,7 @@ def _convert(val0, val1, pos) -> torch.bfloat16: mask1 = 0b1000000000000000 mask2 = 0b0000000110000000 mask3 = 0b0000000001000000 - bf16 = ((val_concat << 1) & mask1) | ((val_concat >> 3) & mask2) | ( - (val_concat >> 7) & mask3) + bf16 = ((val_concat << 1) & mask1) | ((val_concat >> 3) & mask2) | ((val_concat >> 7) & mask3) bf16_new = torch.tensor([bf16], dtype=torch.uint16, device=val0.device).view(torch.bfloat16) # Add bias for change from fp4 to bf16 bf16_new = bf16_new.item() * (2**126) @@ -104,5 +103,5 @@ def print_bit(name, val): val (torch.Tensor): A scalar PyTorch tensor (numeric) whose 32-bit binary representation will be shown. """ val_cpu = val.cpu().item() - binary_repr = f'{val_cpu:032b}' + binary_repr = f"{val_cpu:032b}" print(name, binary_repr) diff --git a/examples/distributed/README.md b/examples/distributed/README.md index e73ae0fac..48cf85488 100644 --- a/examples/distributed/README.md +++ b/examples/distributed/README.md @@ -2,7 +2,7 @@ This directory contains examples demonstrating distributed computing capabilities using TileLang. -For example, +For example, ``` ./tilelang/distributed/launch.sh examples/distributed/example_allgather.py ``` @@ -11,7 +11,7 @@ For example, Before running the examples, you need to build NVSHMEM library for device-side code generation. -```bash +```bash export NVSHMEM_SRC="your_custom_nvshmem_dir" # default to 3rdparty/nvshmem_src cd tilelang/distributed source build_nvshmem.sh diff --git a/examples/distributed/deepseek_deepep/buffer.py b/examples/distributed/deepseek_deepep/buffer.py index f281f19e3..71f7f3faf 100644 --- a/examples/distributed/deepseek_deepep/buffer.py +++ b/examples/distributed/deepseek_deepep/buffer.py @@ -1,4 +1,4 @@ -""" The interface for DeepEP. 
""" +"""The interface for DeepEP.""" import torch import torch.distributed as dist @@ -27,14 +27,16 @@ class EPBuffer: num_sms: int = 20 symm_heap_size: int = 2**30 # size of the symm heap for allocators - def __init__(self, - group: dist.ProcessGroup, - num_nvl_bytes: int, - num_topk: int, - num_experts: int, - hidden: int, - dispatch_cfg: Optional[Config] = None, - combine_cfg: Optional[Config] = None): + def __init__( + self, + group: dist.ProcessGroup, + num_nvl_bytes: int, + num_topk: int, + num_experts: int, + hidden: int, + dispatch_cfg: Optional[Config] = None, + combine_cfg: Optional[Config] = None, + ): """ Initialize the communication buffer. @@ -70,7 +72,8 @@ def __init__(self, is_distributed=True, local_rank=self.rank, num_local_ranks=self.num_ranks, - group=group) + group=group, + ) self._pre_alloc_symm_buffers() self._prepare_counters() @@ -87,81 +90,70 @@ def _pre_alloc_symm_buffers(self): def _pre_alloc_symm_buffers_intranode(self): # barrier signal is always zeroed after each usage, so we can pre-init here - barrier_signal = tilelang.tensor((self.num_ranks), - dtype=torch.int32, - device='cuda', - allocator=self._allocator).zero_() - - per_rank_buffer = tilelang.tensor((self.num_ranks, self.num_ranks), - dtype=torch.int32, - device='cuda', - allocator=self._allocator) - per_expert_buffer = tilelang.tensor((self.num_ranks, self.num_local_experts), - dtype=torch.int32, - device='cuda', - allocator=self._allocator) - - channel_start_offset = tilelang.tensor([self.num_channels, self.num_ranks], - dtype=torch.int32, - device='cuda', - allocator=self._allocator) - channel_end_offset = tilelang.tensor([self.num_channels, self.num_ranks], - dtype=torch.int32, - device='cuda', - allocator=self._allocator) - channel_head_idx = tilelang.tensor([self.num_channels, self.num_ranks], - dtype=torch.int32, - device='cuda', - allocator=self._allocator) - channel_tail_idx = tilelang.tensor([self.num_channels, self.num_ranks], - dtype=torch.int32, - device='cuda', - allocator=self._allocator) + barrier_signal = tilelang.tensor((self.num_ranks), dtype=torch.int32, device="cuda", allocator=self._allocator).zero_() + + per_rank_buffer = tilelang.tensor((self.num_ranks, self.num_ranks), dtype=torch.int32, device="cuda", allocator=self._allocator) + per_expert_buffer = tilelang.tensor( + (self.num_ranks, self.num_local_experts), dtype=torch.int32, device="cuda", allocator=self._allocator + ) + + channel_start_offset = tilelang.tensor( + [self.num_channels, self.num_ranks], dtype=torch.int32, device="cuda", allocator=self._allocator + ) + channel_end_offset = tilelang.tensor( + [self.num_channels, self.num_ranks], dtype=torch.int32, device="cuda", allocator=self._allocator + ) + channel_head_idx = tilelang.tensor([self.num_channels, self.num_ranks], dtype=torch.int32, device="cuda", allocator=self._allocator) + channel_tail_idx = tilelang.tensor([self.num_channels, self.num_ranks], dtype=torch.int32, device="cuda", allocator=self._allocator) # NOTE: for each #ranks, dispatch and combine cfg have the same num_max_nvl_chunked_recv_tokens, so we can use the same buffer here - channel_x_buffers = tilelang.tensor([ - self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens, - self.hidden - ], - dtype=torch.bfloat16, - device='cuda', - allocator=self._allocator) + channel_x_buffers = tilelang.tensor( + [self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens, self.hidden], + dtype=torch.bfloat16, + device="cuda", + allocator=self._allocator, + ) 
channel_src_idx_buffers = tilelang.tensor( [self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens], dtype=torch.int32, - device='cuda', - allocator=self._allocator) - channel_topk_idx_buffers = tilelang.tensor([ - self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens, - self.num_topk - ], - dtype=torch.int64, - device='cuda', - allocator=self._allocator) - channel_topk_weights_buffers = tilelang.tensor([ - self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens, - self.num_topk - ], - dtype=torch.float32, - device='cuda', - allocator=self._allocator) - - self._symm_buffers = (barrier_signal, per_rank_buffer, per_expert_buffer, - channel_start_offset, channel_end_offset, channel_head_idx, - channel_tail_idx, channel_x_buffers, channel_src_idx_buffers, - channel_topk_idx_buffers, channel_topk_weights_buffers) + device="cuda", + allocator=self._allocator, + ) + channel_topk_idx_buffers = tilelang.tensor( + [self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens, self.num_topk], + dtype=torch.int64, + device="cuda", + allocator=self._allocator, + ) + channel_topk_weights_buffers = tilelang.tensor( + [self.num_channels, self.num_ranks, self.dispatch_cfg.num_max_nvl_chunked_recv_tokens, self.num_topk], + dtype=torch.float32, + device="cuda", + allocator=self._allocator, + ) + + self._symm_buffers = ( + barrier_signal, + per_rank_buffer, + per_expert_buffer, + channel_start_offset, + channel_end_offset, + channel_head_idx, + channel_tail_idx, + channel_x_buffers, + channel_src_idx_buffers, + channel_topk_idx_buffers, + channel_topk_weights_buffers, + ) def _pre_alloc_symm_buffers_internode(self): raise NotImplementedError("internode is not supported yet") def _prepare_counters(self): - self._moe_recv_counter, self._moe_recv_counter_mapped = create_mapped_tensor([1], - torch.int32) - self._moe_recv_expert_counter, self._moe_recv_expert_counter_mapped = create_mapped_tensor( - [self.num_local_experts], torch.int32) + self._moe_recv_counter, self._moe_recv_counter_mapped = create_mapped_tensor([1], torch.int32) + self._moe_recv_expert_counter, self._moe_recv_expert_counter_mapped = create_mapped_tensor([self.num_local_experts], torch.int32) if self.num_ranks > 8: # internode - self._moe_recv_rdma_counter, self._moe_recv_rdma_counter_mapped = create_mapped_tensor( - [1], torch.int32) + self._moe_recv_rdma_counter, self._moe_recv_rdma_counter_mapped = create_mapped_tensor([1], torch.int32) @staticmethod def set_num_sms(num_sms: int): @@ -204,19 +196,20 @@ def get_dispatch_layout(self, topk_idx: torch.Tensor): num_tokens_per_expert: `[num_experts]` with `torch.int`, the number of tokens to be sent to each expert. is_token_in_rank: `[num_tokens, num_ranks]` with `torch.bool`, whether a token be sent to a rank. 
""" - num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = get_dispatch_layout( - topk_idx, self.num_experts, self.num_ranks) + num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = get_dispatch_layout(topk_idx, self.num_experts, self.num_ranks) return num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank - def dispatch(self, - x: torch.Tensor, - handle: Optional[Tuple] = None, - num_tokens_per_rank: Optional[torch.Tensor] = None, - is_token_in_rank: Optional[torch.Tensor] = None, - num_tokens_per_expert: Optional[torch.Tensor] = None, - topk_idx: Optional[torch.Tensor] = None, - topk_weights: Optional[torch.Tensor] = None, - expert_alignment: int = 1): + def dispatch( + self, + x: torch.Tensor, + handle: Optional[Tuple] = None, + num_tokens_per_rank: Optional[torch.Tensor] = None, + is_token_in_rank: Optional[torch.Tensor] = None, + num_tokens_per_expert: Optional[torch.Tensor] = None, + topk_idx: Optional[torch.Tensor] = None, + topk_weights: Optional[torch.Tensor] = None, + expert_alignment: int = 1, + ): """ Dispatch tokens to different ranks, both intranode and internode settings are supported. Intranode kernels require all the ranks should be visible via NVLink. @@ -273,11 +266,24 @@ def dispatch(self, else: assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle = intranode_dispatch( - self.rank, self._allocator, self._symm_buffers, self._moe_recv_counter, - self._moe_recv_expert_counter, self._moe_recv_counter_mapped, - self._moe_recv_expert_counter_mapped, x, self.dispatch_cfg, handle, - num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, topk_idx, - topk_weights, expert_alignment, self.comm_stream) + self.rank, + self._allocator, + self._symm_buffers, + self._moe_recv_counter, + self._moe_recv_expert_counter, + self._moe_recv_counter_mapped, + self._moe_recv_expert_counter_mapped, + x, + self.dispatch_cfg, + handle, + num_tokens_per_rank, + is_token_in_rank, + num_tokens_per_expert, + topk_idx, + topk_weights, + expert_alignment, + self.comm_stream, + ) return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle def combine(self, x: torch.Tensor, handle: Tuple, topk_weights: torch.Tensor): @@ -298,7 +304,7 @@ def combine(self, x: torch.Tensor, handle: Tuple, topk_weights: torch.Tensor): recv_x: the reduced token from its dispatched ranks. recv_topk_weights: the reduced top-k weights from its dispatch ranks. """ - recv_x, recv_topk_weights = intranode_combine(self.rank, self._allocator, - self._symm_buffers, x, self.combine_cfg, - handle, topk_weights, self.comm_stream) + recv_x, recv_topk_weights = intranode_combine( + self.rank, self._allocator, self._symm_buffers, x, self.combine_cfg, handle, topk_weights, self.comm_stream + ) return recv_x, recv_topk_weights diff --git a/examples/distributed/deepseek_deepep/deepep.md b/examples/distributed/deepseek_deepep/deepep.md index d3cea90dc..620baf428 100644 --- a/examples/distributed/deepseek_deepep/deepep.md +++ b/examples/distributed/deepseek_deepep/deepep.md @@ -20,14 +20,12 @@ The table below shows a latency and bandwidth comparison for DeepEP and TileScal | DeepEP | 1.0045 | 328.97 | 1.1552 | 287.14 | | TileScale | 1.0720 | 308.25 | 1.0809 | 306.86 | - # Intra-node Introduction This example implements DeepEP’s intra‑node (NVLink) dispatch/combine using TileScale kernels. 
The intra‑node path lives under `intranode/` and provides a minimal public API that mirrors DeepEP’s behavior for NVLink‑connected ranks. - ## Overview - Scope: intra‑node (NVLink) only; all ranks must be within one node and NVLink‑visible.
@@ -35,7 +33,6 @@ The intra‑node path lives under `intranode/` and provides a minimal public API - Datatypes: inputs are `torch.bfloat16`; routing `topk_idx` is `torch.int64`; `topk_weights` is `torch.float32`. - Channels: each channel uses 2 SMs (send/recv). With default `num_sms=20`, there are `num_channels=10`. - ## Public API (intranode) - `intranode.get_dispatch_layout(topk_idx, num_experts, num_ranks)`
@@ -63,7 +60,6 @@ Convenience wrapper used by examples/tests: - Exposes the interface for the functions above via methods: `get_dispatch_layout`, `dispatch`, `combine`. - Manages TileScale allocator, symmetric buffers, and recommended kernel configs. - ## Core Data Structures and Handle - `rank_prefix_matrix` (num_ranks × num_ranks): cumulative per‑rank token counts; used to compute global offsets for receiver writes.
@@ -82,7 +78,6 @@ Dispatch returns the handle: `(rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head)` which can be reused for cached re‑dispatch and is required by the combine stage. - ## Kernel Responsibilities (high level) - Layout
@@ -97,14 +92,12 @@ which can be reused for cached re‑dispatch and is required by the combine stag - `cached_notify_combine_kernel`: recalculates `send_head` expectations and zeros `channel_head_idx`/`channel_tail_idx` for the combine round. - `combine_kernel`: senders return expert outputs; receivers reduce by sum per token. `recv_topk_weights` is the sum of returned weights per token. Requires `hidden % 8 == 0` for vectorized access on the receiver side. - ## Configuration and Tuning - `utils.Config` provides recommended values for `num_max_nvl_chunked_send_tokens` and `num_max_nvl_chunked_recv_tokens` per `num_ranks`. These control per‑round chunk sizes and receiver buffer depth per channel. - `EPBuffer.num_sms` controls total SMs assigned to high‑throughput kernels. Channels = `num_sms // 2` (one send SM + one recv SM per channel). - `expert_alignment` pads per‑local‑expert MoE receive counters up to the specified multiple, which can be used to size per‑expert workspace. - ## Execution Flow (non‑cached) 1) Prepare group and buffers
@@ -138,7 +131,6 @@ which can be reused for cached re‑dispatch and is required by the combine stag 6) Cached re‑dispatch (optional) - For repeated communication with the same layout, pass `handle` back into `EPBuffer.dispatch(x, handle, ...)` to skip layout/notify work and return only `recv_x`. - ## Usage Quick start (intra‑node test):
@@ -174,7 +166,6 @@ recv_x, recv_topk_idx, recv_topk_weights, per_expert_counts, handle = buf.dispat reduced_x, reduced_weights = buf.combine(expert_out, handle, recv_topk_weights) ``` - ## Notes and Limits - Intra‑node only: ranks must be NVLink‑visible; current code asserts `num_ranks <= 8` and `num_experts % num_ranks == 0`.
@@ -184,7 +175,6 @@ reduced_x, reduced_weights = buf.combine(expert_out, handle, recv_topk_weights) - Ensure `topk_idx` is contiguous, 2D, and `torch.int64`. - Set `TILELANG_USE_DISTRIBUTED=1` to enable TileScale’s distributed runtime. - ## Files - `intranode/__init__.py` — re‑exports `get_dispatch_layout`, `intranode_dispatch`, `intranode_combine`.
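Tying the Usage, Configuration, and cached re‑dispatch notes above together, the sketch below shows the intended call pattern. It is illustrative only: the imports assume the example directory layout, `group` is an already initialized NVLink‑local `dist.ProcessGroup`, `new_x`, `expert_out`, and the buffer sizes are placeholders, and the cached call returning only `recv_x` follows the note above rather than a verified signature.

```python
from deepep_utils import Config
from buffer import EPBuffer

# Recommended per-#ranks chunk/queue sizes (see "Configuration and Tuning");
# 8 NVLink-connected ranks assumed here.
dispatch_cfg = Config.get_dispatch_config(8)
combine_cfg = Config.get_combine_config(8)

buf = EPBuffer(group, num_nvl_bytes=256 << 20, num_topk=8, num_experts=64,
               hidden=4096, dispatch_cfg=dispatch_cfg, combine_cfg=combine_cfg)

# Full (non-cached) dispatch: layout + notify + data movement; returns the handle.
num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = buf.get_dispatch_layout(topk_idx)
recv_x, recv_topk_idx, recv_topk_weights, per_expert_counts, handle = buf.dispatch(
    x,
    num_tokens_per_rank=num_tokens_per_rank,
    is_token_in_rank=is_token_in_rank,
    num_tokens_per_expert=num_tokens_per_expert,
    topk_idx=topk_idx,
    topk_weights=topk_weights,
)

# Cached re-dispatch: same routing as before, so reuse the handle and skip
# the layout/notify work; per the note above only recv_x is produced.
recv_x_cached = buf.dispatch(new_x, handle)

# The same handle drives the combine stage.
reduced_x, reduced_weights = buf.combine(expert_out, handle, recv_topk_weights)
```

Passing `dispatch_cfg`/`combine_cfg` explicitly is likely optional, since `EPBuffer` is described above as managing the recommended kernel configs itself.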
@@ -194,7 +184,6 @@ reduced_x, reduced_weights = buf.combine(expert_out, handle, recv_topk_weights) - `buffer.py` — EPBuffer wrapper: allocator and symmetric buffers, public methods. - `utils.py` — recommended configs and MoE counter helpers. - ## Implementation Notes - Negative offset encoding: senders write channel start/end offsets as `-value-1` so that a zero token count is distinguishable from an uninitialized `0`. diff --git a/examples/distributed/deepseek_deepep/deepep_utils.py b/examples/distributed/deepseek_deepep/deepep_utils.py index 1294acb31..288640295 100644 --- a/examples/distributed/deepseek_deepep/deepep_utils.py +++ b/examples/distributed/deepseek_deepep/deepep_utils.py @@ -30,7 +30,7 @@ def __post_init__(self): # 1 sm for send, 1 sm for recv in each channel @staticmethod - def get_dispatch_config(num_ranks: int) -> 'Config': + def get_dispatch_config(num_ranks: int) -> "Config": """ Get a recommended dispatch config. @@ -56,11 +56,11 @@ def get_dispatch_config(num_ranks: int) -> 'Config': 144: Config(num_sms, 32, 720, 12, 128), 160: Config(num_sms, 28, 720, 12, 128), } - assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}' + assert num_ranks in config_map, f"Unsupported number of EP ranks: {num_ranks}" return config_map[num_ranks] @staticmethod - def get_combine_config(num_ranks: int) -> 'Config': + def get_combine_config(num_ranks: int) -> "Config": """ Get a recommended combine config. @@ -86,33 +86,31 @@ def get_combine_config(num_ranks: int) -> 'Config': 144: Config(num_sms, 2, 720, 8, 128), 160: Config(num_sms, 2, 720, 8, 128), } - assert num_ranks in config_map, f'Unsupported number of EP ranks: {num_ranks}' + assert num_ranks in config_map, f"Unsupported number of EP ranks: {num_ranks}" return config_map[num_ranks] # Only necessary in inter-node cases -def set_rdma_env_args(num_qps_per_rank: int = 24, - allow_nvlink_for_low_latency_mode: bool = True, - allow_mnnvl: bool = False): - os.environ['NVSHMEM_DISABLE_P2P'] = '0' if allow_nvlink_for_low_latency_mode else '1' - os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1' - os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}' +def set_rdma_env_args(num_qps_per_rank: int = 24, allow_nvlink_for_low_latency_mode: bool = True, allow_mnnvl: bool = False): + os.environ["NVSHMEM_DISABLE_P2P"] = "0" if allow_nvlink_for_low_latency_mode else "1" + os.environ["NVSHMEM_IB_ENABLE_IBGDA"] = "1" + os.environ["NVSHMEM_IBGDA_NUM_RC_PER_PE"] = f"{num_qps_per_rank}" # Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check - nvshmem_qp_depth = int(os.environ.get('NVSHMEM_QP_DEPTH', '1024')) - os.environ['NVSHMEM_QP_DEPTH'] = str(nvshmem_qp_depth) + nvshmem_qp_depth = int(os.environ.get("NVSHMEM_QP_DEPTH", "1024")) + os.environ["NVSHMEM_QP_DEPTH"] = str(nvshmem_qp_depth) # Reduce gpu memory usage # 6 default teams + 1 extra team - os.environ['NVSHMEM_MAX_TEAMS'] = '7' + os.environ["NVSHMEM_MAX_TEAMS"] = "7" # Disable NVLink SHArP - os.environ['NVSHMEM_DISABLE_NVLS'] = '1' + os.environ["NVSHMEM_DISABLE_NVLS"] = "1" # NOTES: NVSHMEM initialization requires at least 256 MiB - os.environ['NVSHMEM_CUMEM_GRANULARITY'] = f'{2 ** 29}' + os.environ["NVSHMEM_CUMEM_GRANULARITY"] = f"{2**29}" if not allow_mnnvl: # Disable multi-node NVLink detection - os.environ['NVSHMEM_DISABLE_MNNVL'] = '1' + os.environ["NVSHMEM_DISABLE_MNNVL"] = "1" def unpack_bias(bias: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]): @@ -147,10 +145,10 @@ def gen_inputs(num_tokens: int, 
hidden: int, num_topk: int, num_experts: int, nu assert num_topk <= num_experts, "num_topk must be less than or equal to num_experts" assert num_experts % num_ranks == 0, "num_experts must be divisible by num_ranks" - x = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') - scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1 + x = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device="cuda") + scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device="cuda").abs() + 1 topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=False)[1] - topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda') + topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device="cuda") rank_idx = topk_idx // (num_experts // num_ranks) rank_idx.masked_fill_(topk_idx == -1, -1) inplace_unique(rank_idx, num_ranks) @@ -192,7 +190,7 @@ def ep_bench(fn, warmup: int = 50, rep: int = 50, post_fn=None): # Flush L2 cache with 256 MB data torch.cuda.synchronize() - cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda') + cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda") # Warmup for _ in range(warmup): @@ -248,8 +246,5 @@ def ep_bench(fn, warmup: int = 50, rep: int = 50, post_fn=None): """ ep_ext = load_inline( - name="ep_ext", - cpp_sources=_src, - functions=["wait_for_counters_ready"], - extra_cflags=["-O3", "-march=native"], - verbose=False) + name="ep_ext", cpp_sources=_src, functions=["wait_for_counters_ready"], extra_cflags=["-O3", "-march=native"], verbose=False +) diff --git a/examples/distributed/deepseek_deepep/intranode/combine.py b/examples/distributed/deepseek_deepep/intranode/combine.py index 17c5f175c..aa95b9339 100644 --- a/examples/distributed/deepseek_deepep/intranode/combine.py +++ b/examples/distributed/deepseek_deepep/intranode/combine.py @@ -11,7 +11,7 @@ import tilelang.language as T tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) @@ -19,15 +19,15 @@ def cached_notify_combine_kernel(num_ranks, num_sms): num_channels = num_sms // 2 threads = max(128, 32 * num_ranks) - num_recv_tokens = T.dynamic('num_recv_tokens') + num_recv_tokens = T.dynamic("num_recv_tokens") @T.prim_func def cached_notify_combine_main( - send_head: T.Tensor([num_recv_tokens, num_ranks], "int32"), - ##### symm buffers ##### - channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), - channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), - barrier_signal: T.Tensor((num_ranks,), 'int32'), + send_head: T.Tensor([num_recv_tokens, num_ranks], "int32"), + ##### symm buffers ##### + channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), + channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), + barrier_signal: T.Tensor((num_ranks,), "int32"), ): with T.Kernel(num_channels + 1, threads=threads) as bx: tx = T.get_thread_binding() @@ -48,17 +48,15 @@ def cached_notify_combine_main( token_start_idx = T.min(tokens_per_channel * channel_id, num_recv_tokens) token_end_idx = T.min(token_start_idx + tokens_per_channel, num_recv_tokens) - last_head = T.alloc_var('int32', init=2**25) # a heuristic large number - # todo: tilelang doesn't support reverse loop, we simulate this - for i in T.serial(0, token_end_idx - token_start_idx, 32): - token_idx_tail = 
token_end_idx - i - 1 + last_head = T.alloc_var("int32", init=2**25) # a heuristic large number + for token_idx_tail in T.serial(token_end_idx - 1, token_start_idx - 1, -32): token_idx = token_idx_tail - lane_id - current_head = T.alloc_var('int32') + current_head = T.alloc_var("int32") if token_idx >= token_start_idx: T.ld(send_head[token_idx, rank_id], current_head, nc=True) else: current_head = -1 - expected_head = T.alloc_var('int32') + expected_head = T.alloc_var("int32") expected_head = 0 for j in T.serial(T.min(32, token_idx_tail - token_start_idx + 1)): head = T.tvm_warp_shuffle(-1, current_head, j, 32, 32) @@ -74,31 +72,27 @@ def cached_notify_combine_main( def cached_notify_combine( - num_ranks, - num_sms, - ##### symm buffers ##### - send_head: torch.Tensor, - channel_head_idx: torch.Tensor, - channel_tail_idx: torch.Tensor, - barrier_signal: torch.Tensor, - allocator, - comm_stream=None): + num_ranks, + num_sms, + ##### symm buffers ##### + send_head: torch.Tensor, + channel_head_idx: torch.Tensor, + channel_tail_idx: torch.Tensor, + barrier_signal: torch.Tensor, + allocator, +): kernel = cached_notify_combine_kernel(num_ranks, num_sms) - kernel.initialize(allocator=allocator, stream=comm_stream.cuda_stream) + kernel.initialize(allocator=allocator) - kernel( - send_head, - channel_head_idx, - channel_tail_idx, - barrier_signal, - stream=comm_stream.cuda_stream, - skip_tensor_validation=True) # reduce runtime overhead + kernel(send_head, channel_head_idx, channel_tail_idx, barrier_signal) # reduce runtime overhead -@tilelang.jit(pass_configs={ - "tl.disable_tma_lower": True, # use TMA later - "tl.disable_warp_specialized": True -}) +@tilelang.jit( + pass_configs={ + "tl.disable_tma_lower": True, # use TMA later + "tl.disable_warp_specialized": True, + } +) def combine_kernel( num_ranks, num_max_send_tokens, # config.num_max_nvl_chunked_send_tokens @@ -106,10 +100,10 @@ def combine_kernel( hidden, num_topk, num_sms, - dtype: str = 'bfloat16', + dtype: str = "bfloat16", ): - num_tokens = T.dynamic('num_tokens') - num_recv_tokens = T.dynamic('num_recv_tokens') + num_tokens = T.dynamic("num_tokens") + num_recv_tokens = T.dynamic("num_recv_tokens") num_channels = num_sms // 2 threads = 768 # 24 warps @@ -140,12 +134,9 @@ def combine_main( # symm buffers channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), # reuse, already zeroed channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), # reuse, already zeroed - channel_x_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, hidden], - dtype), - channel_src_idx_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens], - "int32"), - channel_topk_weights_buffers: T.Tensor( - [num_channels, num_ranks, num_recv_buffer_tokens, num_topk], "float32"), + channel_x_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, hidden], dtype), + channel_src_idx_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens], "int32"), + channel_topk_weights_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, num_topk], "float32"), ): with T.Kernel(num_sms, threads=threads) as bx: tx = T.get_thread_binding() @@ -158,85 +149,85 @@ def combine_main( send_warp_id_in_rank = warp_id // num_ranks # get tasks - rank_offset = T.if_then_else(send_rank_id > 0, rank_prefix_matrix[send_rank_id - 1, - rank], 0) + rank_offset = T.if_then_else(send_rank_id > 0, rank_prefix_matrix[send_rank_id - 1, rank], 0) num_rank_tokens = rank_prefix_matrix[send_rank_id, rank] - rank_offset channel_offset = 
channel_prefix_matrix[send_rank_id, responsible_channel] - num_channel_tokens = T.if_then_else( - responsible_channel == num_channels - 1, num_rank_tokens, - channel_prefix_matrix[send_rank_id, responsible_channel + 1]) - channel_offset + num_channel_tokens = ( + T.if_then_else( + responsible_channel == num_channels - 1, + num_rank_tokens, + channel_prefix_matrix[send_rank_id, responsible_channel + 1], + ) + - channel_offset + ) token_start_idx = rank_offset + channel_offset token_end_idx = token_start_idx + num_channel_tokens # Iterate over all tokens and send by trunk - current_channel_tail_idx = T.alloc_var('int32') + current_channel_tail_idx = T.alloc_var("int32") current_channel_tail_idx = 0 - token_idx = T.alloc_var('int32') + token_idx = T.alloc_var("int32") token_idx = token_start_idx - with T.While(token_idx < token_end_idx): + while token_idx < token_end_idx: # Check destination queue emptiness, or wait a buffer to be released (rare cases) num_round_tokens = T.min(num_max_send_tokens, token_end_idx - token_idx) - if T.elect_one_sync(): + if T.shuffle_elect(32): T.wait_ge( channel_head_idx[responsible_channel, rank], current_channel_tail_idx + num_round_tokens - num_recv_buffer_tokens, - peer=send_rank_id) + peer=send_rank_id, + ) T.sync_warp() # Send by trunk for i in T.serial(send_warp_id_in_rank, num_round_tokens, warps_per_rank): # Get an empty slot - dst_slot_idx = T.alloc_var('int32') + dst_slot_idx = T.alloc_var("int32") dst_slot_idx = (current_channel_tail_idx + i) % num_recv_buffer_tokens # 1. copy data T.put_warp( T.address_of(x[token_idx + i, 0]), - T.address_of(channel_x_buffers[responsible_channel, rank, dst_slot_idx, - 0]), + T.address_of(channel_x_buffers[responsible_channel, rank, dst_slot_idx, 0]), hidden, dst_pe=send_rank_id, unroll_factor=4, - enable_aggressive_vectorize=True) + enable_aggressive_vectorize=True, + ) # 2. send src idx - idx = T.alloc_var('int32') - if T.elect_one_sync(): + idx = T.alloc_var("int32") + if T.shuffle_elect(32): T.ld(src_idx[token_idx + i], idx, nc=True) - T.st( - channel_src_idx_buffers[responsible_channel, rank, dst_slot_idx], - idx, - dst_pe=send_rank_id) + T.st(channel_src_idx_buffers[responsible_channel, rank, dst_slot_idx], idx, dst_pe=send_rank_id) # 3. send topk_weights if num_topk > 0 and lane_id < num_topk: - weight = T.alloc_var('float32') + weight = T.alloc_var("float32") T.ld(topk_weights[token_idx + i, lane_id], weight, nc=True) T.st( - channel_topk_weights_buffers[responsible_channel, rank, - dst_slot_idx, lane_id], - weight, - dst_pe=send_rank_id) + channel_topk_weights_buffers[responsible_channel, rank, dst_slot_idx, lane_id], weight, dst_pe=send_rank_id + ) token_idx += num_round_tokens current_channel_tail_idx += num_round_tokens # move tail index T.sync_threads(send_rank_id, threads_per_rank) - if send_warp_id_in_rank == 0 and T.elect_one_sync(): + if T.shuffle_elect(96): T.st( channel_tail_idx[responsible_channel, rank], current_channel_tail_idx, - scope='sys', - sem='release', - dst_pe=send_rank_id) + scope="sys", + sem="release", + dst_pe=send_rank_id, + ) else: # receiver - #? Why we must need scope='shared', not 'shared.dynamic' here? - warp_channel_head_idx = T.alloc_shared([warps, num_ranks], 'int32', scope='shared') - shared_channel_tail_idx = T.alloc_shared( - [32], 'int32', scope='shared') #! workaround for illegal address - warp_retired = T.alloc_shared([warps], 'bool', scope='shared') + # ? Why we must need scope='shared', not 'shared.dynamic' here? 
+ warp_channel_head_idx = T.alloc_shared([warps, num_ranks], "int32", scope="shared") + shared_channel_tail_idx = T.alloc_shared([32], "int32", scope="shared") #! workaround for illegal address + warp_retired = T.alloc_shared([warps], "bool", scope="shared") if tx < warps: warp_retired[tx] = False if lane_id < num_ranks: @@ -246,84 +237,66 @@ def combine_main( T.sync_threads() if tx < 32: # one warp for moving the queue head - last_head = T.alloc_var('int32') + last_head = T.alloc_var("int32") last_head = 0 - with T.While(lane_id < num_ranks): + while lane_id < num_ranks: # check retired - retired = T.alloc_var('bool') + retired = T.alloc_var("bool") retired = True for i in T.serial(1, warps): retired = retired and warp_retired[i] if retired: - T.loop_break() + break # Update queue tail - new_tail = T.alloc_var('int32') - T.ld( - channel_tail_idx[responsible_channel, lane_id], - new_tail, - sem="acquire", - scope="sys") + new_tail = T.alloc_var("int32") + T.ld(channel_tail_idx[responsible_channel, lane_id], new_tail, sem="acquire", scope="sys") # Use release semantics to ensure receiver warps see the update - T.st( - shared_channel_tail_idx[lane_id], new_tail, sem="release", - scope="cta") # todo: weaker sem pair + T.st(shared_channel_tail_idx[lane_id], new_tail, sem="release", scope="cta") # todo: weaker sem pair # Update minimum head - min_head = T.alloc_var('int32') + min_head = T.alloc_var("int32") min_head = 2**31 - 1 # int32 max for i in T.serial(1, warps): if not warp_retired[i]: min_head = T.min(min_head, warp_channel_head_idx[i, lane_id]) if min_head != 2**31 - 1 and min_head > last_head: last_head = min_head - T.st( - channel_head_idx[responsible_channel, lane_id], - min_head, - sem="relaxed", - scope="sys") + T.st(channel_head_idx[responsible_channel, lane_id], min_head, sem="relaxed", scope="sys") else: # other warps for reduction # All lanes will use data buffer, but only rank lane will use `head/tail/src_idx` # The same tokens as the dispatch process - num_tokens_per_channel = T.truncdiv(num_recv_tokens + num_channels - 1, - num_channels) + num_tokens_per_channel = T.truncdiv(num_recv_tokens + num_channels - 1, num_channels) # todo: this is a workaround, as TVM has a bug when calculating safe ceildiv for tir.Var - token_start_idx = T.min(num_tokens_per_channel * responsible_channel, - num_recv_tokens) + token_start_idx = T.min(num_tokens_per_channel * responsible_channel, num_recv_tokens) token_end_idx = T.min(token_start_idx + num_tokens_per_channel, num_recv_tokens) # Iterate over all tokens and combine - for token_idx in T.serial(token_start_idx + warp_id - 1, token_end_idx, - warps - 1): + for token_idx in T.serial(token_start_idx + warp_id - 1, token_end_idx, warps - 1): # Read expected head - expected_head = T.alloc_var('int32') + expected_head = T.alloc_var("int32") expected_head = -1 if lane_id < num_ranks: T.ld(send_head[token_idx, lane_id], expected_head, nc=True) - condvar = T.alloc_var('int32') + condvar = T.alloc_var("int32") T.ld(shared_channel_tail_idx[lane_id], condvar, sem="acquire", scope="cta") - with T.While(T.warp_any(condvar <= expected_head and expected_head >= 0)): - T.ld( - shared_channel_tail_idx[lane_id], - condvar, - sem="acquire", - scope="cta") - T.loop_continue() + while T.warp_any(condvar <= expected_head and expected_head >= 0): + T.ld(shared_channel_tail_idx[lane_id], condvar, sem="acquire", scope="cta") + continue # can we simplify this ? 
T.sync_warp() # Broadcast current heads - num_topk_ranks = T.alloc_var('int32') + num_topk_ranks = T.alloc_var("int32") num_topk_ranks = 0 - topk_ranks = T.alloc_local([num_ranks], 'int32') - slot_indices = T.alloc_local([num_ranks], 'int32') + topk_ranks = T.alloc_local([num_ranks], "int32") + slot_indices = T.alloc_local([num_ranks], "int32") for i in T.serial(num_ranks): expected_head_i = T.tvm_warp_shuffle(-1, expected_head, i, 32, 32) if expected_head_i >= 0: - slot_indices[ - num_topk_ranks] = expected_head_i % num_recv_buffer_tokens + slot_indices[num_topk_ranks] = expected_head_i % num_recv_buffer_tokens topk_ranks[num_topk_ranks] = i num_topk_ranks += 1 @@ -337,10 +310,10 @@ def combine_main( for j in T.serial(num_topk_ranks): for k in T.vectorized(8): T.ld( - channel_x_buffers[responsible_channel, topk_ranks[j], - slot_indices[j], i * 8 + k], + channel_x_buffers[responsible_channel, topk_ranks[j], slot_indices[j], i * 8 + k], recv_value[j, k], - nc=True) + nc=True, + ) # todo: support bias @@ -349,47 +322,52 @@ def combine_main( for k in T.vectorized(8): values[k] += recv_value[j, k] for j in T.vectorized(8): - recv_x[token_idx, - i * 8 + j] = values[j] # todo: further vectorize this + recv_x[token_idx, i * 8 + j] = values[j] # todo: further vectorize this # Reduce topk_weights if lane_id < num_topk: - weight_sum = T.alloc_var('float32') + weight_sum = T.alloc_var("float32") weight_sum = 0 for i in T.serial(num_topk_ranks): - weight = T.alloc_var('float32') + weight = T.alloc_var("float32") T.ld( - channel_topk_weights_buffers[responsible_channel, topk_ranks[i], - slot_indices[i], lane_id], + channel_topk_weights_buffers[responsible_channel, topk_ranks[i], slot_indices[i], lane_id], weight, - nc=True) + nc=True, + ) weight_sum += weight recv_topk_weights[token_idx, lane_id] = weight_sum # Update head if lane_id < num_ranks: warp_channel_head_idx[warp_id, lane_id] = T.if_then_else( - expected_head < 0, -expected_head - 1, expected_head + 1) + expected_head < 0, -expected_head - 1, expected_head + 1 + ) # Retired T.sync_warp() - if T.elect_one_sync(): + if T.shuffle_elect(32): warp_retired[warp_id] = True return combine_main -def intranode_combine(rank: int, - allocator, - symm_buffers, - x, - config, - handle, - topk_weights, - comm_stream=None): +def intranode_combine(rank: int, allocator, symm_buffers, x, config, handle, topk_weights, comm_stream=None): assert handle is not None rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, _, send_head = handle - barrier_signal, _, _, _, _, channel_head_idx, channel_tail_idx, channel_x_buffers, channel_src_idx_buffers, _, channel_topk_weights_buffers = symm_buffers + ( + barrier_signal, + _, + _, + _, + _, + channel_head_idx, + channel_tail_idx, + channel_x_buffers, + channel_src_idx_buffers, + _, + channel_topk_weights_buffers, + ) = symm_buffers # acquire_shapes _, hidden = x.shape @@ -398,19 +376,12 @@ def intranode_combine(rank: int, num_recv_tokens = send_head.shape[0] # notify combine - cached_notify_combine( - num_ranks, - config.num_sms, - send_head, - channel_head_idx, - channel_tail_idx, - barrier_signal, - allocator, - comm_stream=comm_stream) + with torch.cuda.stream(comm_stream): + cached_notify_combine(num_ranks, config.num_sms, send_head, channel_head_idx, channel_tail_idx, barrier_signal, allocator) # combine - recv_x = torch.empty((num_recv_tokens, hidden), dtype=x.dtype, device='cuda') - recv_topk_weights = torch.empty((num_recv_tokens, num_topk), dtype=torch.float32, device='cuda') + 
recv_x = torch.empty((num_recv_tokens, hidden), dtype=x.dtype, device="cuda") + recv_topk_weights = torch.empty((num_recv_tokens, num_topk), dtype=torch.float32, device="cuda") kernel = combine_kernel( num_ranks, @@ -419,25 +390,26 @@ def intranode_combine(rank: int, hidden, num_topk, config.num_sms, - dtype='bfloat16') - kernel.initialize(allocator=allocator, stream=comm_stream.cuda_stream) - kernel( - rank, - x, - topk_weights, - recv_src_idx, - recv_x, - recv_topk_weights, - rank_prefix_matrix, - recv_channel_prefix_matrix, - send_head, - channel_head_idx, - channel_tail_idx, - channel_x_buffers, - channel_src_idx_buffers, - channel_topk_weights_buffers, - stream=comm_stream.cuda_stream, - skip_tensor_validation=True) # reduce runtime overhead + dtype="bfloat16", + ) + with torch.cuda.stream(comm_stream): + kernel.initialize(allocator=allocator) + kernel( + rank, + x, + topk_weights, + recv_src_idx, + recv_x, + recv_topk_weights, + rank_prefix_matrix, + recv_channel_prefix_matrix, + send_head, + channel_head_idx, + channel_tail_idx, + channel_x_buffers, + channel_src_idx_buffers, + channel_topk_weights_buffers, + ) # reduce runtime overhead compute_stream = torch.cuda.current_stream() compute_stream.wait_stream(comm_stream) return recv_x, recv_topk_weights diff --git a/examples/distributed/deepseek_deepep/intranode/dispatch.py b/examples/distributed/deepseek_deepep/intranode/dispatch.py index 0811a4eb1..83912a089 100644 --- a/examples/distributed/deepseek_deepep/intranode/dispatch.py +++ b/examples/distributed/deepseek_deepep/intranode/dispatch.py @@ -11,9 +11,10 @@ import tilelang.language as T from typing import Optional, Tuple from deepep_utils import Config, ep_ext # noqa: F403 +import tvm_ffi # tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log # notify_dispatch is responsible for: @@ -30,26 +31,26 @@ def notify_dispatch_kernel( num_local_experts = num_experts // num_ranks num_warps = threads // 32 - num_tokens = T.dynamic('num_tokens') + num_tokens = T.dynamic("num_tokens") @T.prim_func def notify_dispatch_main( - rank: T.int32, - num_tokens_per_rank: T.Tensor((num_ranks,), 'int32'), - num_tokens_per_expert: T.Tensor((num_experts,), 'int32'), - is_token_in_rank: T.Tensor((num_tokens, num_ranks), 'bool'), - moe_recv_counter_mapped: T.Tensor((1,), 'int32'), - moe_recv_expert_counter_mapped: T.Tensor((num_local_experts,), 'int32'), - per_rank_buffer: T.Tensor((num_ranks, num_ranks), 'int32'), - per_expert_buffer: T.Tensor((num_ranks, num_local_experts), 'int32'), - barrier_signal: T.Tensor((num_ranks,), 'int32'), - rank_prefix_matrix: T.Tensor((num_ranks, num_ranks), 'int32'), - channel_prefix_matrix: T.Tensor((num_ranks, num_channels), 'int32'), - # 4 symm buffers to be zeroed - channel_start_offset: T.Tensor([num_channels, num_ranks], "int32"), - channel_end_offset: T.Tensor([num_channels, num_ranks], "int32"), - channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), - channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), + rank: T.int32, + num_tokens_per_rank: T.Tensor((num_ranks,), "int32"), + num_tokens_per_expert: T.Tensor((num_experts,), "int32"), + is_token_in_rank: T.Tensor((num_tokens, num_ranks), "bool"), + moe_recv_counter_mapped: T.Tensor((1,), "int32"), + moe_recv_expert_counter_mapped: T.Tensor((num_local_experts,), "int32"), + per_rank_buffer: T.Tensor((num_ranks, num_ranks), "int32"), + per_expert_buffer: T.Tensor((num_ranks, num_local_experts), "int32"), + 
barrier_signal: T.Tensor((num_ranks,), "int32"), + rank_prefix_matrix: T.Tensor((num_ranks, num_ranks), "int32"), + channel_prefix_matrix: T.Tensor((num_ranks, num_channels), "int32"), + # 4 symm buffers to be zeroed + channel_start_offset: T.Tensor([num_channels, num_ranks], "int32"), + channel_end_offset: T.Tensor([num_channels, num_ranks], "int32"), + channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), + channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), ): with T.Kernel(num_ranks + 1, threads=threads) as bx: tx = T.get_thread_binding() @@ -64,10 +65,7 @@ def notify_dispatch_main( if tx < num_ranks: T.st(per_rank_buffer[rank, tx], num_tokens_per_rank[tx], dst_pe=tx) for i in T.serial(num_local_experts): - T.st( - per_expert_buffer[rank, i], - num_tokens_per_expert[tx * num_local_experts + i], - dst_pe=tx) + T.st(per_expert_buffer[rank, i], num_tokens_per_expert[tx * num_local_experts + i], dst_pe=tx) T.barrier_blocks(barrier_signal) @@ -80,7 +78,7 @@ def notify_dispatch_main( # Sum per-expert cnts if tx < num_local_experts: - sum = T.alloc_local([1], 'int32') + sum = T.alloc_local([1], "int32") sum[0] = 0 for i in T.serial(0, num_ranks): sum[0] += per_expert_buffer[i, tx] @@ -106,12 +104,12 @@ def notify_dispatch_main( # todo: this is a workaround, as TVM has a bug when calculating safe ceildiv for tir.Var token_start_idx = T.min(num_tokens_per_channel * channel_id, num_tokens) token_end_idx = T.min(token_start_idx + num_tokens_per_channel, num_tokens) - cnt = T.alloc_var('int32') + cnt = T.alloc_var("int32") cnt = 0 for i in T.serial(token_start_idx + lane_id, token_end_idx, 32): cnt += is_token_in_rank[i, dst_rank] cnt = T.warp_reduce_sum(cnt) - if T.elect_one_sync(): + if T.shuffle_elect(32): channel_prefix_matrix[dst_rank, channel_id] = cnt T.sync_threads() @@ -149,7 +147,7 @@ def notify_dispatch( channel_tail_idx: torch.Tensor, # allocator allocator, - comm_stream=None, + comm_stream: torch.cuda.Stream = None, ): kernel = notify_dispatch_kernel( num_ranks, @@ -159,8 +157,8 @@ def notify_dispatch( ) kernel.initialize(allocator=allocator, stream=comm_stream.cuda_stream) - rank_prefix_matrix = torch.empty([num_ranks, num_ranks], dtype=torch.int32, device='cuda') - channel_prefix_matrix = torch.empty([num_ranks, num_channels], dtype=torch.int32, device='cuda') + rank_prefix_matrix = torch.empty([num_ranks, num_ranks], dtype=torch.int32, device="cuda") + channel_prefix_matrix = torch.empty([num_ranks, num_channels], dtype=torch.int32, device="cuda") # clear buffers and counters moe_recv_counter.fill_(-1) @@ -182,27 +180,22 @@ def notify_dispatch( channel_end_offset, channel_head_idx, channel_tail_idx, - stream=comm_stream.cuda_stream, - skip_tensor_validation=True # reduce runtime overhead ) - - num_recv_tokens, num_recv_tokens_per_expert_list = ep_ext.wait_for_counters_ready( - moe_recv_counter, moe_recv_expert_counter) + num_recv_tokens, num_recv_tokens_per_expert_list = ep_ext.wait_for_counters_ready(moe_recv_counter, moe_recv_expert_counter) return num_recv_tokens, num_recv_tokens_per_expert_list, rank_prefix_matrix, channel_prefix_matrix # cached_notify_dispatch only needs to clear symm buffers @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) def cached_notify_dispatch_kernel(num_ranks: int, num_channels: int): - @T.prim_func def cached_notify_dispatch_main( - barrier_signal: T.Tensor((num_ranks,), 'int32'), - # 4 symm buffers to be zeroed - channel_start_offset: T.Tensor([num_channels, num_ranks], "int32"), - 
channel_end_offset: T.Tensor([num_channels, num_ranks], "int32"), - channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), - channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), + barrier_signal: T.Tensor((num_ranks,), "int32"), + # 4 symm buffers to be zeroed + channel_start_offset: T.Tensor([num_channels, num_ranks], "int32"), + channel_end_offset: T.Tensor([num_channels, num_ranks], "int32"), + channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), + channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), ): with T.Kernel(1, threads=128): T.sync_blocks(barrier_signal) @@ -232,22 +225,23 @@ def cached_notify_dispatch( comm_stream=None, ): kernel = cached_notify_dispatch_kernel(num_ranks, num_channels) - kernel.initialize( - allocator=allocator, stream=comm_stream.cuda_stream) # we still comm on barrier_signal - kernel( - barrier_signal, - channel_start_offset, - channel_end_offset, - channel_head_idx, - channel_tail_idx, - stream=comm_stream.cuda_stream, - skip_tensor_validation=True) # reduce runtime overhead + kernel.initialize(allocator=allocator, stream=comm_stream.cuda_stream) + with torch.cuda.stream(comm_stream): + kernel( + barrier_signal, + channel_start_offset, + channel_end_offset, + channel_head_idx, + channel_tail_idx, + ) -@tilelang.jit(pass_configs={ - "tl.disable_tma_lower": True, # enable TMA later - "tl.disable_warp_specialized": True -}) +@tilelang.jit( + pass_configs={ + "tl.disable_tma_lower": True, # enable TMA later + "tl.disable_warp_specialized": True, + } +) def dispatch_kernel( num_ranks, num_max_send_tokens, # config.num_max_nvl_chunked_send_tokens @@ -256,7 +250,7 @@ def dispatch_kernel( num_topk, num_experts, num_sms, - dtype: str = 'bfloat16', + dtype: str = "bfloat16", ): threads = 768 # 24 warps TMABytesPerWarp = 8192 @@ -269,17 +263,17 @@ def dispatch_kernel( num_warps = threads // 32 # 24 num_warps_per_rank = num_warps // num_ranks # 3 - num_tokens = T.dynamic('num_tokens') - num_recv_tokens = T.dynamic('num_recv_tokens') + num_tokens = T.dynamic("num_tokens") + num_recv_tokens = T.dynamic("num_recv_tokens") @T.prim_func def dispatch_main( rank: T.int32, # output recv_x: T.Tensor((num_recv_tokens, hidden), dtype), - recv_src_idx: T.Tensor((num_recv_tokens,), 'int32'), - recv_topk_idx: T.Tensor((num_recv_tokens, num_topk), 'int64'), - recv_topk_weights: T.Tensor((num_recv_tokens, num_topk), 'float'), + recv_src_idx: T.Tensor((num_recv_tokens,), "int32"), + recv_topk_idx: T.Tensor((num_recv_tokens, num_topk), "int64"), + recv_topk_weights: T.Tensor((num_recv_tokens, num_topk), "float"), recv_channel_offset: T.Tensor([num_ranks, num_channels], "int32"), send_head: T.Tensor([num_tokens, num_ranks], "int32"), # input @@ -297,14 +291,10 @@ def dispatch_main( channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), # channel data buffers, stored on the receiver side - channel_x_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, hidden], - dtype), - channel_src_idx_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens], - "int32"), - channel_topk_idx_buffers: T.Tensor( - [num_channels, num_ranks, num_recv_buffer_tokens, num_topk], "int64"), - channel_topk_weights_buffers: T.Tensor( - [num_channels, num_ranks, num_recv_buffer_tokens, num_topk], "float32"), + channel_x_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, hidden], dtype), + channel_src_idx_buffers: T.Tensor([num_channels, num_ranks, 
num_recv_buffer_tokens], "int32"), + channel_topk_idx_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, num_topk], "int64"), + channel_topk_weights_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, num_topk], "float32"), # channel_x_scales_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, num_scales], "float32"), ): with T.Kernel(num_sms, threads=threads) as bx: @@ -318,65 +308,53 @@ def dispatch_main( # send offset by `-value-1` e.g. 0->-1, 1->-2 # this is for distinguishing zero tokens - if send_warp_id_in_rank == 0 and T.elect_one_sync(): - value = T.alloc_var('int32') - value = T.if_then_else( - responsible_channel > 0, channel_prefix_matrix[responsible_rank, - responsible_channel - 1], 0) - T.st( - channel_start_offset[responsible_channel, rank], - -value - 1, - scope='sys', - sem='relaxed', - dst_pe=responsible_rank) + if send_warp_id_in_rank == 0 and T.shuffle_elect(32): + value = T.alloc_var("int32") + value = T.if_then_else(responsible_channel > 0, channel_prefix_matrix[responsible_rank, responsible_channel - 1], 0) + T.st(channel_start_offset[responsible_channel, rank], -value - 1, scope="sys", sem="relaxed", dst_pe=responsible_rank) value = channel_prefix_matrix[responsible_rank, responsible_channel] - T.st( - channel_end_offset[responsible_channel, rank], - -value - 1, - scope='sys', - sem='relaxed', - dst_pe=responsible_rank) + T.st(channel_end_offset[responsible_channel, rank], -value - 1, scope="sys", sem="relaxed", dst_pe=responsible_rank) T.sync_warp() # get task num_tokens_per_channel = T.truncdiv(num_tokens + num_channels - 1, num_channels) # todo: this is a workaround, as TVM has a bug when calculating safe ceildiv for tir.Var - token_start_idx = T.alloc_var('int32') + token_start_idx = T.alloc_var("int32") token_start_idx = T.min(num_tokens_per_channel * responsible_channel, num_tokens) - token_end_idx = T.alloc_var('int32') + token_end_idx = T.alloc_var("int32") token_end_idx = T.min(token_start_idx + num_tokens_per_channel, num_tokens) # sender mainloop: iterate over all tokens and send by trunk - cached_channel_tail_idx = T.alloc_var('int32') + cached_channel_tail_idx = T.alloc_var("int32") cached_channel_tail_idx = 0 - token_idx = T.alloc_var('int32') + token_idx = T.alloc_var("int32") token_idx = token_start_idx - with T.While(token_idx < token_end_idx): - if T.elect_one_sync(): + while token_idx < token_end_idx: + if T.shuffle_elect(32): T.wait_ge( channel_head_idx[responsible_channel, rank], num_max_send_tokens + cached_channel_tail_idx - num_recv_buffer_tokens, - responsible_rank) + responsible_rank, + ) T.sync_warp() - chunk_token_idx = T.alloc_var('int32') + chunk_token_idx = T.alloc_var("int32") chunk_token_idx = 0 while chunk_token_idx < num_max_send_tokens and token_idx < token_end_idx: # for the same token, the warp assigned to save `send_head` may be different from the warp # assigned to send the following data - if token_idx % num_warps_per_rank == send_warp_id_in_rank and T.elect_one_sync( - ): + if token_idx % num_warps_per_rank == send_warp_id_in_rank and T.shuffle_elect(32): send_head[token_idx, responsible_rank] = T.if_then_else( - is_token_in_rank[token_idx, responsible_rank], - cached_channel_tail_idx, -1) + is_token_in_rank[token_idx, responsible_rank], cached_channel_tail_idx, -1 + ) # skip if not selected if not is_token_in_rank[token_idx, responsible_rank]: token_idx += 1 - T.loop_continue() + continue # selected, get an empty slot - dst_slot_idx = T.alloc_var('int32') + dst_slot_idx = 
T.alloc_var("int32") dst_slot_idx = cached_channel_tail_idx % num_recv_buffer_tokens cached_channel_tail_idx += 1 if cached_channel_tail_idx % num_warps_per_rank == send_warp_id_in_rank: @@ -384,20 +362,16 @@ def dispatch_main( # 1. copy data T.put_warp( T.address_of(x[token_idx, 0]), - T.address_of(channel_x_buffers[responsible_channel, rank, - dst_slot_idx, 0]), + T.address_of(channel_x_buffers[responsible_channel, rank, dst_slot_idx, 0]), hidden, dst_pe=responsible_rank, unroll_factor=4, - enable_aggressive_vectorize=True) + enable_aggressive_vectorize=True, + ) # 2. copy src idx - if T.elect_one_sync(): - T.st( - channel_src_idx_buffers[responsible_channel, rank, - dst_slot_idx], - token_idx, - dst_pe=responsible_rank) + if T.shuffle_elect(32): + T.st(channel_src_idx_buffers[responsible_channel, rank, dst_slot_idx], token_idx, dst_pe=responsible_rank) # 3. copy `topk_idx` and `topk_weights` with transformed index if lane_id < num_topk: @@ -405,26 +379,26 @@ def dispatch_main( recv_expert_begin = responsible_rank * num_local_experts recv_expert_end = recv_expert_begin + num_local_experts - idx_value = T.alloc_var('int64') + idx_value = T.alloc_var("int64") T.ld(topk_idx[token_idx, lane_id], idx_value, nc=True) idx_value = T.if_then_else( - recv_expert_begin <= T.cast(idx_value, 'int32') < - recv_expert_end, idx_value - recv_expert_begin, -1) + recv_expert_begin <= T.cast(idx_value, "int32") < recv_expert_end, idx_value - recv_expert_begin, -1 + ) T.st( - channel_topk_idx_buffers[responsible_channel, rank, - dst_slot_idx, lane_id], + channel_topk_idx_buffers[responsible_channel, rank, dst_slot_idx, lane_id], idx_value, - dst_pe=responsible_rank) + dst_pe=responsible_rank, + ) # topk_weights - weight_value = T.alloc_var('float32') + weight_value = T.alloc_var("float32") T.ld(topk_weights[token_idx, lane_id], weight_value, nc=True) weight_value = T.if_then_else(idx_value >= 0, weight_value, 0) T.st( - channel_topk_weights_buffers[responsible_channel, rank, - dst_slot_idx, lane_id], + channel_topk_weights_buffers[responsible_channel, rank, dst_slot_idx, lane_id], weight_value, - dst_pe=responsible_rank) + dst_pe=responsible_rank, + ) # 4. 
copy scale (support fp8 later) @@ -434,36 +408,30 @@ def dispatch_main( # move tail index # here all warps should share the same new tail T.sync_threads(responsible_rank, num_threads_per_rank) - if send_warp_id_in_rank == 0 and T.elect_one_sync(): + if send_warp_id_in_rank == 0 and T.shuffle_elect(32): T.st( channel_tail_idx[responsible_channel, rank], cached_channel_tail_idx, - scope='sys', - sem='release', - dst_pe=responsible_rank) + scope="sys", + sem="release", + dst_pe=responsible_rank, + ) else: # receiver recv_thread_id_in_rank = tx % num_threads_per_rank recv_warp_id_in_rank = recv_thread_id_in_rank // 32 # calculate offset first - rank_offset = T.if_then_else(responsible_rank > 0, - rank_prefix_matrix[responsible_rank - 1, rank], 0) + rank_offset = T.if_then_else(responsible_rank > 0, rank_prefix_matrix[responsible_rank - 1, rank], 0) # receive channel offset - total_offset = T.alloc_var('int32') - num_tokens_to_recv = T.alloc_var('int32') - if T.elect_one_sync(): + total_offset = T.alloc_var("int32") + num_tokens_to_recv = T.alloc_var("int32") + if T.shuffle_elect(32): T.wait_ne(channel_start_offset[responsible_channel, responsible_rank], 0) - T.ld( - channel_start_offset[responsible_channel, responsible_rank], - total_offset, - sem='volatile') + T.ld(channel_start_offset[responsible_channel, responsible_rank], total_offset, sem="volatile") T.wait_ne(channel_end_offset[responsible_channel, responsible_rank], 0) - T.ld( - channel_end_offset[responsible_channel, responsible_rank], - num_tokens_to_recv, - sem='volatile') + T.ld(channel_end_offset[responsible_channel, responsible_rank], num_tokens_to_recv, sem="volatile") total_offset = -total_offset - 1 num_tokens_to_recv = -num_tokens_to_recv - 1 if recv_warp_id_in_rank == 0: @@ -474,24 +442,20 @@ def dispatch_main( num_tokens_to_recv = T.tvm_warp_shuffle(-1, num_tokens_to_recv, 0, 32, 32) # Shared tail indices for different warps - shared_channel_tail_idx = T.alloc_shared([num_ranks], 'int32') + shared_channel_tail_idx = T.alloc_shared([num_ranks], "int32") - cached_channel_head_idx = T.alloc_var('int32') + cached_channel_head_idx = T.alloc_var("int32") cached_channel_head_idx = 0 - cached_channel_tail_idx = T.alloc_var('int32') + cached_channel_tail_idx = T.alloc_var("int32") cached_channel_tail_idx = 0 - with T.While(num_tokens_to_recv > 0): - with T.While(recv_thread_id_in_rank == 0): - T.ld( - channel_tail_idx[responsible_channel, responsible_rank], - cached_channel_tail_idx, - sem='acquire', - scope='sys') + while num_tokens_to_recv > 0: + while recv_thread_id_in_rank == 0: + T.ld(channel_tail_idx[responsible_channel, responsible_rank], cached_channel_tail_idx, sem="acquire", scope="sys") # read to copy if cached_channel_head_idx != cached_channel_tail_idx: shared_channel_tail_idx[responsible_rank] = cached_channel_tail_idx - T.loop_break() + break # sync queue tail T.sync_threads(responsible_rank, num_threads_per_rank) @@ -500,48 +464,42 @@ def dispatch_main( # copy data # 1. 
recv x num_cur_recv_tokens = cached_channel_tail_idx - cached_channel_head_idx - for chunk_idx in T.serial(recv_warp_id_in_rank, num_cur_recv_tokens, - num_warps_per_rank): - token_idx_in_buffer = (cached_channel_head_idx + - chunk_idx) % num_recv_buffer_tokens + for chunk_idx in T.serial(recv_warp_id_in_rank, num_cur_recv_tokens, num_warps_per_rank): + token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens # T.copy(channel_x_buffers[responsible_channel, responsible_rank, token_idx_in_buffer, :], recv_x[total_offset+chunk_idx, :]) # todo: add ld_nc and st_na #! T.copy will cause layout inference error T.put_warp( - T.address_of(channel_x_buffers[responsible_channel, responsible_rank, - token_idx_in_buffer, 0]), + T.address_of(channel_x_buffers[responsible_channel, responsible_rank, token_idx_in_buffer, 0]), T.address_of(recv_x[total_offset + chunk_idx, 0]), hidden, -1, 5, - enable_aggressive_vectorize=True) + enable_aggressive_vectorize=True, + ) # 2. recv src_idx - for chunk_idx in T.serial(cached_channel_head_idx + recv_thread_id_in_rank, - cached_channel_tail_idx, num_threads_per_rank): - local_src_idx = T.alloc_var('int32') + for chunk_idx in T.serial( + cached_channel_head_idx + recv_thread_id_in_rank, cached_channel_tail_idx, num_threads_per_rank + ): + local_src_idx = T.alloc_var("int32") T.ld( - channel_src_idx_buffers[responsible_channel, responsible_rank, - chunk_idx % num_recv_buffer_tokens], + channel_src_idx_buffers[responsible_channel, responsible_rank, chunk_idx % num_recv_buffer_tokens], local_src_idx, - nc=True) - recv_src_idx[total_offset + chunk_idx - - cached_channel_head_idx] = local_src_idx + nc=True, + ) + recv_src_idx[total_offset + chunk_idx - cached_channel_head_idx] = local_src_idx # 3. recv topk_idx and topk_weights - for idx in T.serial(recv_thread_id_in_rank, num_cur_recv_tokens * num_topk, - num_threads_per_rank): + for idx in T.serial(recv_thread_id_in_rank, num_cur_recv_tokens * num_topk, num_threads_per_rank): chunk_idx = idx // num_topk token_topk_idx = idx % num_topk - token_idx_in_buffer = (cached_channel_head_idx + - chunk_idx) % num_recv_buffer_tokens - recv_topk_idx[total_offset + chunk_idx, - token_topk_idx] = channel_topk_idx_buffers[ - responsible_channel, responsible_rank, - token_idx_in_buffer, token_topk_idx] - recv_topk_weights[total_offset + chunk_idx, - token_topk_idx] = channel_topk_weights_buffers[ - responsible_channel, responsible_rank, - token_idx_in_buffer, token_topk_idx] + token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens + recv_topk_idx[total_offset + chunk_idx, token_topk_idx] = channel_topk_idx_buffers[ + responsible_channel, responsible_rank, token_idx_in_buffer, token_topk_idx + ] + recv_topk_weights[total_offset + chunk_idx, token_topk_idx] = channel_topk_weights_buffers[ + responsible_channel, responsible_rank, token_idx_in_buffer, token_topk_idx + ] # 4. 
recv scale (support fp8 later) @@ -549,12 +507,8 @@ def dispatch_main( cached_channel_head_idx += num_cur_recv_tokens total_offset += num_cur_recv_tokens T.sync_threads(responsible_rank, num_threads_per_rank) - if recv_warp_id_in_rank == num_warps_per_rank - 1 and T.elect_one_sync(): - T.st( - channel_head_idx[responsible_channel, responsible_rank], - cached_channel_head_idx, - scope='sys', - sem='relaxed') + if recv_warp_id_in_rank == num_warps_per_rank - 1 and T.shuffle_elect(32): + T.st(channel_head_idx[responsible_channel, responsible_rank], cached_channel_head_idx, scope="sys", sem="relaxed") # Exit num_tokens_to_recv -= num_cur_recv_tokens @@ -562,10 +516,12 @@ def dispatch_main( return dispatch_main -@tilelang.jit(pass_configs={ - "tl.disable_tma_lower": True, # enable TMA later - "tl.disable_warp_specialized": True -}) +@tilelang.jit( + pass_configs={ + "tl.disable_tma_lower": True, # enable TMA later + "tl.disable_warp_specialized": True, + } +) def cached_dispatch_kernel( num_ranks, num_tokens, @@ -573,7 +529,7 @@ def cached_dispatch_kernel( num_recv_buffer_tokens, # config.num_max_nvl_chunked_recv_tokens hidden, num_sms, - dtype: str = 'bfloat16', + dtype: str = "bfloat16", ): threads = 768 # 24 warps TMABytesPerWarp = 8192 @@ -585,14 +541,14 @@ def cached_dispatch_kernel( num_warps = threads // 32 # 24 num_warps_per_rank = num_warps // num_ranks # 3 - num_recv_tokens = T.dynamic('num_recv_tokens') + num_recv_tokens = T.dynamic("num_recv_tokens") @T.prim_func def cached_dispatch_main( rank: T.int32, # output recv_x: T.Tensor((num_recv_tokens, hidden), dtype), - recv_src_idx: T.Tensor((num_recv_tokens,), 'int32'), + recv_src_idx: T.Tensor((num_recv_tokens,), "int32"), recv_channel_offset: T.Tensor([num_ranks, num_channels], "int32"), send_head: T.Tensor([num_tokens, num_ranks], "int32"), # input @@ -608,10 +564,8 @@ def cached_dispatch_main( channel_head_idx: T.Tensor([num_channels, num_ranks], "int32"), channel_tail_idx: T.Tensor([num_channels, num_ranks], "int32"), # channel data buffers, stored on the receiver side - channel_x_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, hidden], - dtype), - channel_src_idx_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens], - "int32"), + channel_x_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, hidden], dtype), + channel_src_idx_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens], "int32"), # channel_x_scales_buffers: T.Tensor([num_channels, num_ranks, num_recv_buffer_tokens, num_scales], "float32"), ): with T.Kernel(num_sms, threads=threads) as bx: @@ -624,65 +578,52 @@ def cached_dispatch_main( # send offset by `-value-1` e.g. 
0->-1, 1->-2 # this is for distinguishing zero tokens - if send_warp_id_in_rank == 0 and T.elect_one_sync(): - value = T.alloc_var('int32') - value = T.if_then_else( - responsible_channel > 0, channel_prefix_matrix[responsible_rank, - responsible_channel - 1], 0) - T.st( - channel_start_offset[responsible_channel, rank], - -value - 1, - scope='sys', - sem='relaxed', - dst_pe=responsible_rank) + if send_warp_id_in_rank == 0 and T.shuffle_elect(32): + value = T.alloc_var("int32") + value = T.if_then_else(responsible_channel > 0, channel_prefix_matrix[responsible_rank, responsible_channel - 1], 0) + T.st(channel_start_offset[responsible_channel, rank], -value - 1, scope="sys", sem="relaxed", dst_pe=responsible_rank) value = channel_prefix_matrix[responsible_rank, responsible_channel] - T.st( - channel_end_offset[responsible_channel, rank], - -value - 1, - scope='sys', - sem='relaxed', - dst_pe=responsible_rank) + T.st(channel_end_offset[responsible_channel, rank], -value - 1, scope="sys", sem="relaxed", dst_pe=responsible_rank) T.sync_warp() # get task - num_tokens_per_channel = T.alloc_var( - 'int32', init=T.ceildiv(num_tokens, num_channels)) - token_start_idx = T.alloc_var('int32') + num_tokens_per_channel = T.alloc_var("int32", init=T.ceildiv(num_tokens, num_channels)) + token_start_idx = T.alloc_var("int32") token_start_idx = T.min(num_tokens_per_channel * responsible_channel, num_tokens) - token_end_idx = T.alloc_var('int32') + token_end_idx = T.alloc_var("int32") token_end_idx = T.min(token_start_idx + num_tokens_per_channel, num_tokens) # sender mainloop: iterate over all tokens and send by trunk - cached_channel_tail_idx = T.alloc_var('int32') + cached_channel_tail_idx = T.alloc_var("int32") cached_channel_tail_idx = 0 - token_idx = T.alloc_var('int32') + token_idx = T.alloc_var("int32") token_idx = token_start_idx - with T.While(token_idx < token_end_idx): - if T.elect_one_sync(): + while token_idx < token_end_idx: + if T.shuffle_elect(32): T.wait_ge( channel_head_idx[responsible_channel, rank], num_max_send_tokens + cached_channel_tail_idx - num_recv_buffer_tokens, - responsible_rank) + responsible_rank, + ) T.sync_warp() - chunk_token_idx = T.alloc_var('int32') + chunk_token_idx = T.alloc_var("int32") chunk_token_idx = 0 while chunk_token_idx < num_max_send_tokens and token_idx < token_end_idx: # for the same token, the warp assigned to save `send_head` may be different from the warp # assigned to send the following data - if token_idx % num_warps_per_rank == send_warp_id_in_rank and T.elect_one_sync( - ): + if token_idx % num_warps_per_rank == send_warp_id_in_rank and T.shuffle_elect(32): send_head[token_idx, responsible_rank] = T.if_then_else( - is_token_in_rank[token_idx, responsible_rank], - cached_channel_tail_idx, -1) + is_token_in_rank[token_idx, responsible_rank], cached_channel_tail_idx, -1 + ) # skip if not selected if not is_token_in_rank[token_idx, responsible_rank]: token_idx += 1 - T.loop_continue() + continue # selected, get an empty slot - dst_slot_idx = T.alloc_var('int32') + dst_slot_idx = T.alloc_var("int32") dst_slot_idx = cached_channel_tail_idx % num_recv_buffer_tokens cached_channel_tail_idx += 1 if cached_channel_tail_idx % num_warps_per_rank == send_warp_id_in_rank: @@ -690,20 +631,16 @@ def cached_dispatch_main( # 1. 
copy data T.put_warp( T.address_of(x[token_idx, 0]), - T.address_of(channel_x_buffers[responsible_channel, rank, - dst_slot_idx, 0]), + T.address_of(channel_x_buffers[responsible_channel, rank, dst_slot_idx, 0]), hidden, dst_pe=responsible_rank, unroll_factor=4, - enable_aggressive_vectorize=True) + enable_aggressive_vectorize=True, + ) # 2. copy src idx - if T.elect_one_sync(): - T.st( - channel_src_idx_buffers[responsible_channel, rank, - dst_slot_idx], - token_idx, - dst_pe=responsible_rank) + if T.shuffle_elect(32): + T.st(channel_src_idx_buffers[responsible_channel, rank, dst_slot_idx], token_idx, dst_pe=responsible_rank) # 4. copy scale (support fp8 later) @@ -713,36 +650,30 @@ def cached_dispatch_main( # move tail index # here all warps should share the same new tail T.sync_threads(responsible_rank, num_threads_per_rank) - if send_warp_id_in_rank == 0 and T.elect_one_sync(): + if T.shuffle_elect(96): T.st( channel_tail_idx[responsible_channel, rank], cached_channel_tail_idx, - scope='sys', - sem='release', - dst_pe=responsible_rank) + scope="sys", + sem="release", + dst_pe=responsible_rank, + ) else: # receiver recv_thread_id_in_rank = tx % num_threads_per_rank recv_warp_id_in_rank = recv_thread_id_in_rank // 32 # calculate offset first - rank_offset = T.if_then_else(responsible_rank > 0, - rank_prefix_matrix[responsible_rank - 1, rank], 0) + rank_offset = T.if_then_else(responsible_rank > 0, rank_prefix_matrix[responsible_rank - 1, rank], 0) # receive channel offset - total_offset = T.alloc_var('int32') - num_tokens_to_recv = T.alloc_var('int32') - if T.elect_one_sync(): + total_offset = T.alloc_var("int32") + num_tokens_to_recv = T.alloc_var("int32") + if T.shuffle_elect(32): T.wait_ne(channel_start_offset[responsible_channel, responsible_rank], 0) - T.ld( - channel_start_offset[responsible_channel, responsible_rank], - total_offset, - sem='volatile') + T.ld(channel_start_offset[responsible_channel, responsible_rank], total_offset, sem="volatile") T.wait_ne(channel_end_offset[responsible_channel, responsible_rank], 0) - T.ld( - channel_end_offset[responsible_channel, responsible_rank], - num_tokens_to_recv, - sem='volatile') + T.ld(channel_end_offset[responsible_channel, responsible_rank], num_tokens_to_recv, sem="volatile") total_offset = -total_offset - 1 num_tokens_to_recv = -num_tokens_to_recv - 1 if recv_warp_id_in_rank == 0: @@ -753,24 +684,20 @@ def cached_dispatch_main( num_tokens_to_recv = T.tvm_warp_shuffle(-1, num_tokens_to_recv, 0, 32, 32) # Shared tail indices for different warps - shared_channel_tail_idx = T.alloc_shared([num_ranks], 'int32') + shared_channel_tail_idx = T.alloc_shared([num_ranks], "int32") - cached_channel_head_idx = T.alloc_var('int32') + cached_channel_head_idx = T.alloc_var("int32") cached_channel_head_idx = 0 - cached_channel_tail_idx = T.alloc_var('int32') + cached_channel_tail_idx = T.alloc_var("int32") cached_channel_tail_idx = 0 - with T.While(num_tokens_to_recv > 0): - with T.While(recv_thread_id_in_rank == 0): - T.ld( - channel_tail_idx[responsible_channel, responsible_rank], - cached_channel_tail_idx, - sem='acquire', - scope='sys') + while num_tokens_to_recv > 0: + while recv_thread_id_in_rank == 0: + T.ld(channel_tail_idx[responsible_channel, responsible_rank], cached_channel_tail_idx, sem="acquire", scope="sys") # read to copy if cached_channel_head_idx != cached_channel_tail_idx: shared_channel_tail_idx[responsible_rank] = cached_channel_tail_idx - T.loop_break() + break # sync queue tail T.sync_threads(responsible_rank, 
num_threads_per_rank) @@ -779,31 +706,29 @@ def cached_dispatch_main( # copy data # 1. recv x num_cur_recv_tokens = cached_channel_tail_idx - cached_channel_head_idx - for chunk_idx in T.serial(recv_warp_id_in_rank, num_cur_recv_tokens, - num_warps_per_rank): - token_idx_in_buffer = (cached_channel_head_idx + - chunk_idx) % num_recv_buffer_tokens + for chunk_idx in T.serial(recv_warp_id_in_rank, num_cur_recv_tokens, num_warps_per_rank): + token_idx_in_buffer = (cached_channel_head_idx + chunk_idx) % num_recv_buffer_tokens #! T.copy will cause layout inference error T.put_warp( - T.address_of(channel_x_buffers[responsible_channel, responsible_rank, - token_idx_in_buffer, 0]), + T.address_of(channel_x_buffers[responsible_channel, responsible_rank, token_idx_in_buffer, 0]), T.address_of(recv_x[total_offset + chunk_idx, 0]), hidden, -1, 5, - enable_aggressive_vectorize=True) + enable_aggressive_vectorize=True, + ) # 2. recv src_idx - for chunk_idx in T.serial(cached_channel_head_idx + recv_thread_id_in_rank, - cached_channel_tail_idx, num_threads_per_rank): - local_src_idx = T.alloc_var('int32') + for chunk_idx in T.serial( + cached_channel_head_idx + recv_thread_id_in_rank, cached_channel_tail_idx, num_threads_per_rank + ): + local_src_idx = T.alloc_var("int32") T.ld( - channel_src_idx_buffers[responsible_channel, responsible_rank, - chunk_idx % num_recv_buffer_tokens], + channel_src_idx_buffers[responsible_channel, responsible_rank, chunk_idx % num_recv_buffer_tokens], local_src_idx, - nc=True) - recv_src_idx[total_offset + chunk_idx - - cached_channel_head_idx] = local_src_idx + nc=True, + ) + recv_src_idx[total_offset + chunk_idx - cached_channel_head_idx] = local_src_idx # 4. recv scale (support fp8 later) @@ -811,12 +736,8 @@ def cached_dispatch_main( cached_channel_head_idx += num_cur_recv_tokens total_offset += num_cur_recv_tokens T.sync_threads(responsible_rank, num_threads_per_rank) - if recv_warp_id_in_rank == num_warps_per_rank - 1 and T.elect_one_sync(): - T.st( - channel_head_idx[responsible_channel, responsible_rank], - cached_channel_head_idx, - scope='sys', - sem='relaxed') + if T.shuffle_elect(96): + T.st(channel_head_idx[responsible_channel, responsible_rank], cached_channel_head_idx, scope="sys", sem="relaxed") # Exit num_tokens_to_recv -= num_cur_recv_tokens @@ -848,8 +769,9 @@ def intranode_dispatch( # todo: support async functionality ): if handle is None: - assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None, \ - "num_tokens_per_rank, is_token_in_rank, and num_tokens_per_expert must be provided in non-cached mode" + assert num_tokens_per_rank is not None and is_token_in_rank is not None and num_tokens_per_expert is not None, ( + "num_tokens_per_rank, is_token_in_rank, and num_tokens_per_expert must be provided in non-cached mode" + ) else: rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head = handle @@ -858,8 +780,19 @@ def intranode_dispatch( num_ranks = num_tokens_per_rank.shape[0] num_topk = topk_idx.shape[1] if handle is None else 0 - barrier_signal, per_rank_buffer, per_expert_buffer, channel_start_offset, channel_end_offset, channel_head_idx, channel_tail_idx, \ - channel_x_buffers, channel_src_idx_buffers, channel_topk_idx_buffers, channel_topk_weights_buffers = symm_buffers + ( + barrier_signal, + per_rank_buffer, + per_expert_buffer, + channel_start_offset, + channel_end_offset, + channel_head_idx, + channel_tail_idx, + channel_x_buffers, + 
channel_src_idx_buffers, + channel_topk_idx_buffers, + channel_topk_weights_buffers, + ) = symm_buffers if handle is None: num_recv_tokens, num_recv_tokens_per_expert_list, rank_prefix_matrix, channel_prefix_matrix = notify_dispatch( @@ -895,76 +828,84 @@ def intranode_dispatch( channel_tail_idx, barrier_signal, allocator, - comm_stream=comm_stream) + comm_stream=comm_stream, + ) num_recv_tokens = recv_src_idx.size(0) - recv_x = torch.empty((num_recv_tokens, hidden), dtype=x.dtype, device='cuda') - recv_src_idx = torch.empty((num_recv_tokens,), dtype=torch.int32, device='cuda') + recv_x = torch.empty((num_recv_tokens, hidden), dtype=x.dtype, device="cuda") + recv_src_idx = torch.empty((num_recv_tokens,), dtype=torch.int32, device="cuda") if handle is None: - recv_topk_idx = torch.empty((num_recv_tokens, num_topk), dtype=torch.int64, device='cuda') - recv_topk_weights = torch.empty((num_recv_tokens, num_topk), - dtype=torch.float32, - device='cuda') - recv_channel_prefix_matrix = torch.empty((num_ranks, config.num_channels), - dtype=torch.int32, - device='cuda') - send_head = torch.empty((num_tokens, num_ranks), dtype=torch.int32, device='cuda') + recv_topk_idx = torch.empty((num_recv_tokens, num_topk), dtype=torch.int64, device="cuda") + recv_topk_weights = torch.empty((num_recv_tokens, num_topk), dtype=torch.float32, device="cuda") + recv_channel_prefix_matrix = torch.empty((num_ranks, config.num_channels), dtype=torch.int32, device="cuda") + send_head = torch.empty((num_tokens, num_ranks), dtype=torch.int32, device="cuda") # run dispatch if handle is None: - kernel = dispatch_kernel(num_ranks, config.num_max_nvl_chunked_send_tokens, - config.num_max_nvl_chunked_recv_tokens, hidden, num_topk, - num_experts, config.num_sms, 'bfloat16') - kernel.initialize(allocator=allocator) - kernel( - rank, - recv_x, - recv_src_idx, - recv_topk_idx, - recv_topk_weights, - recv_channel_prefix_matrix, - send_head, - x, - topk_idx, - topk_weights, - is_token_in_rank, - rank_prefix_matrix, - channel_prefix_matrix, - channel_start_offset, - channel_end_offset, - channel_head_idx, - channel_tail_idx, - channel_x_buffers, - channel_src_idx_buffers, - channel_topk_idx_buffers, - channel_topk_weights_buffers, - stream=comm_stream.cuda_stream, - skip_tensor_validation=True) # reduce runtime overhead - handle = (rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, - recv_src_idx, is_token_in_rank, send_head) + kernel = dispatch_kernel( + num_ranks, + config.num_max_nvl_chunked_send_tokens, + config.num_max_nvl_chunked_recv_tokens, + hidden, + num_topk, + num_experts, + config.num_sms, + "bfloat16", + ) + kernel.initialize(allocator=allocator, stream=comm_stream.cuda_stream) + with tvm_ffi.use_torch_stream(torch.cuda.stream(comm_stream)): + kernel( + rank, + recv_x, + recv_src_idx, + recv_topk_idx, + recv_topk_weights, + recv_channel_prefix_matrix, + send_head, + x, + topk_idx, + topk_weights, + is_token_in_rank, + rank_prefix_matrix, + channel_prefix_matrix, + channel_start_offset, + channel_end_offset, + channel_head_idx, + channel_tail_idx, + channel_x_buffers, + channel_src_idx_buffers, + channel_topk_idx_buffers, + channel_topk_weights_buffers, + ) + handle = (rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head) return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle else: - kernel = cached_dispatch_kernel(num_ranks, num_tokens, - config.num_max_nvl_chunked_send_tokens, - 
config.num_max_nvl_chunked_recv_tokens, hidden, - config.num_sms, 'bfloat16') + kernel = cached_dispatch_kernel( + num_ranks, + num_tokens, + config.num_max_nvl_chunked_send_tokens, + config.num_max_nvl_chunked_recv_tokens, + hidden, + config.num_sms, + "bfloat16", + ) kernel.initialize(allocator=allocator, stream=comm_stream.cuda_stream) - kernel( - rank, - recv_x, - recv_src_idx, - recv_channel_prefix_matrix, - send_head, - x, - is_token_in_rank, - rank_prefix_matrix, - channel_prefix_matrix, - channel_start_offset, - channel_end_offset, - channel_head_idx, - channel_tail_idx, - channel_x_buffers, - channel_src_idx_buffers, - stream=comm_stream.cuda_stream, - skip_tensor_validation=True) # reduce runtime overhead + with torch.cuda.stream(comm_stream): + kernel( + rank, + recv_x, + recv_src_idx, + recv_channel_prefix_matrix, + send_head, + x, + is_token_in_rank, + rank_prefix_matrix, + channel_prefix_matrix, + channel_start_offset, + channel_end_offset, + channel_head_idx, + channel_tail_idx, + channel_x_buffers, + channel_src_idx_buffers, + ) return recv_x diff --git a/examples/distributed/deepseek_deepep/intranode/example_intranode.py b/examples/distributed/deepseek_deepep/intranode/example_intranode.py index 8f555dfee..41ea25834 100644 --- a/examples/distributed/deepseek_deepep/intranode/example_intranode.py +++ b/examples/distributed/deepseek_deepep/intranode/example_intranode.py @@ -13,7 +13,7 @@ from deepep_utils import gen_inputs, ep_bench # tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def test_intranode( @@ -37,170 +37,187 @@ def test_intranode( deepep_buffer = deep_ep.Buffer(group, num_nvl_bytes=2**30) # Generate inputs for testing - x, topk_idx, topk_weights, rank_idx = gen_inputs(num_tokens, hidden, num_topk, num_experts, - num_ranks) + x, topk_idx, topk_weights, rank_idx = gen_inputs(num_tokens, hidden, num_topk, num_experts, num_ranks) # 1. test get_dispatch_layout ref_num_tokens_per_rank, _, ref_num_tokens_per_expert, ref_is_token_in_rank, _ = deepep_buffer.get_dispatch_layout( - topk_idx, num_experts) - num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = ts_buffer.get_dispatch_layout( - topk_idx) + topk_idx, num_experts + ) + num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank = ts_buffer.get_dispatch_layout(topk_idx) - assert torch.equal(num_tokens_per_expert, ref_num_tokens_per_expert), \ + assert torch.equal(num_tokens_per_expert, ref_num_tokens_per_expert), ( f"[rank {rank}] num_tokens_per_expert mismatch, max err: {(num_tokens_per_expert - ref_num_tokens_per_expert).abs().max()}" - assert torch.equal(is_token_in_rank, ref_is_token_in_rank), \ - f"[rank {rank}] is_token_in_rank mismatch" - assert torch.equal(num_tokens_per_rank, ref_num_tokens_per_rank), \ + ) + assert torch.equal(is_token_in_rank, ref_is_token_in_rank), f"[rank {rank}] is_token_in_rank mismatch" + assert torch.equal(num_tokens_per_rank, ref_num_tokens_per_rank), ( f"[rank {rank}] num_tokens_per_rank mismatch, max err: {(num_tokens_per_rank - ref_num_tokens_per_rank).abs().max()}" + ) group.barrier() if rank == 0: - print('Check passed for get_dispatch_layout. ✅') + print("Check passed for get_dispatch_layout. ✅") # 2. 
test dispatch # ref - ref_recv_x, ref_recv_topk_idx, ref_recv_topk_weights, ref_num_recv_tokens_per_expert_list, ref_handle, event = \ - deepep_buffer.dispatch(x, None, ref_num_tokens_per_rank, None, ref_is_token_in_rank, ref_num_tokens_per_expert, topk_idx, topk_weights, expert_alignment) + ref_recv_x, ref_recv_topk_idx, ref_recv_topk_weights, ref_num_recv_tokens_per_expert_list, ref_handle, event = deepep_buffer.dispatch( + x, None, ref_num_tokens_per_rank, None, ref_is_token_in_rank, ref_num_tokens_per_expert, topk_idx, topk_weights, expert_alignment + ) # ours if cached_dispatch: - recv_x = ts_buffer.dispatch(x, ref_handle, num_tokens_per_rank, is_token_in_rank, - num_tokens_per_expert, None, None, expert_alignment) + recv_x = ts_buffer.dispatch( + x, ref_handle, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, None, None, expert_alignment + ) else: recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle = ts_buffer.dispatch( - x, None, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, topk_idx, - topk_weights, expert_alignment) + x, None, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, topk_idx, topk_weights, expert_alignment + ) # check dispatch output - assert torch.equal( - recv_x, - ref_recv_x), f'[rank {rank}] recv_x mismatch, max err: {(recv_x - ref_recv_x).abs().max()}' + assert torch.equal(recv_x, ref_recv_x), f"[rank {rank}] recv_x mismatch, max err: {(recv_x - ref_recv_x).abs().max()}" if not cached_dispatch: - assert torch.equal( - recv_topk_idx, ref_recv_topk_idx - ), f'[rank {rank}] recv_topk_idx mismatch, max err: {(recv_topk_idx - ref_recv_topk_idx).abs().max()}' - assert torch.equal( - recv_topk_weights, ref_recv_topk_weights - ), f'[rank {rank}] recv_topk_weights mismatch, max err: {(recv_topk_weights - ref_recv_topk_weights).abs().max()}' - assert num_recv_tokens_per_expert_list == ref_num_recv_tokens_per_expert_list, f'[rank {rank}] num_recv_tokens_per_expert_list mismatch' + assert torch.equal(recv_topk_idx, ref_recv_topk_idx), ( + f"[rank {rank}] recv_topk_idx mismatch, max err: {(recv_topk_idx - ref_recv_topk_idx).abs().max()}" + ) + assert torch.equal(recv_topk_weights, ref_recv_topk_weights), ( + f"[rank {rank}] recv_topk_weights mismatch, max err: {(recv_topk_weights - ref_recv_topk_weights).abs().max()}" + ) + assert num_recv_tokens_per_expert_list == ref_num_recv_tokens_per_expert_list, ( + f"[rank {rank}] num_recv_tokens_per_expert_list mismatch" + ) # check handle rank_prefix_matrix, channel_prefix_matrix, recv_channel_prefix_matrix, recv_src_idx, is_token_in_rank, send_head = handle - ref_rank_prefix_matrix, ref_channel_prefix_matrix, ref_recv_channel_prefix_matrix, ref_recv_src_idx, ref_is_token_in_rank, ref_send_head = ref_handle - assert torch.equal( - rank_prefix_matrix, ref_rank_prefix_matrix - ), f'[rank {rank}] rank_prefix_matrix mismatch, max err: {(rank_prefix_matrix - ref_rank_prefix_matrix).abs().max()}' - assert torch.equal( - channel_prefix_matrix, ref_channel_prefix_matrix - ), f'[rank {rank}] channel_prefix_matrix mismatch, max err: {(channel_prefix_matrix - ref_channel_prefix_matrix).abs().max()}' - assert torch.equal( - recv_channel_prefix_matrix, ref_recv_channel_prefix_matrix - ), f'[rank {rank}] recv_channel_prefix_matrix mismatch, max err: {(recv_channel_prefix_matrix - ref_recv_channel_prefix_matrix).abs().max()}' - assert torch.equal( - recv_src_idx, ref_recv_src_idx - ), f'[rank {rank}] recv_src_idx mismatch, max err: {(recv_src_idx - ref_recv_src_idx).abs().max()}' - 
assert torch.equal( - is_token_in_rank, ref_is_token_in_rank - ), f'[rank {rank}] is_token_in_rank mismatch, max err: {(is_token_in_rank - ref_is_token_in_rank).abs().max()}' - assert torch.equal( - send_head, ref_send_head - ), f'[rank {rank}] send_head mismatch, max err: {(send_head - ref_send_head).abs().max()}' + ( + ref_rank_prefix_matrix, + ref_channel_prefix_matrix, + ref_recv_channel_prefix_matrix, + ref_recv_src_idx, + ref_is_token_in_rank, + ref_send_head, + ) = ref_handle + assert torch.equal(rank_prefix_matrix, ref_rank_prefix_matrix), ( + f"[rank {rank}] rank_prefix_matrix mismatch, max err: {(rank_prefix_matrix - ref_rank_prefix_matrix).abs().max()}" + ) + assert torch.equal(channel_prefix_matrix, ref_channel_prefix_matrix), ( + f"[rank {rank}] channel_prefix_matrix mismatch, max err: {(channel_prefix_matrix - ref_channel_prefix_matrix).abs().max()}" + ) + assert torch.equal(recv_channel_prefix_matrix, ref_recv_channel_prefix_matrix), ( + f"[rank {rank}] recv_channel_prefix_matrix mismatch, max err: {(recv_channel_prefix_matrix - ref_recv_channel_prefix_matrix).abs().max()}" + ) + assert torch.equal(recv_src_idx, ref_recv_src_idx), ( + f"[rank {rank}] recv_src_idx mismatch, max err: {(recv_src_idx - ref_recv_src_idx).abs().max()}" + ) + assert torch.equal(is_token_in_rank, ref_is_token_in_rank), ( + f"[rank {rank}] is_token_in_rank mismatch, max err: {(is_token_in_rank - ref_is_token_in_rank).abs().max()}" + ) + assert torch.equal(send_head, ref_send_head), ( + f"[rank {rank}] send_head mismatch, max err: {(send_head - ref_send_head).abs().max()}" + ) group.barrier() if rank == 0: - print(f'Check passed for {"cached" if cached_dispatch else "non-cached"} dispatch. ✅') + print(f"Check passed for {'cached' if cached_dispatch else 'non-cached'} dispatch. ✅") # 3. test combine - ref_combined_x, ref_combined_topk_weights, _ = deepep_buffer.combine( - recv_x, ref_handle, ref_recv_topk_weights) + ref_combined_x, ref_combined_topk_weights, _ = deepep_buffer.combine(recv_x, ref_handle, ref_recv_topk_weights) if cached_dispatch: # acquire handle first recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle = ts_buffer.dispatch( - x, None, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, topk_idx, - topk_weights, expert_alignment) + x, None, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, topk_idx, topk_weights, expert_alignment + ) combined_x, combined_topk_weights = ts_buffer.combine(recv_x, handle, recv_topk_weights) - assert torch.equal( - combined_x, ref_combined_x - ), f'[rank {rank}] combined_x mismatch, max err: {(combined_x - ref_combined_x).abs().max()}' - assert torch.equal( - combined_topk_weights, ref_combined_topk_weights - ), f'[rank {rank}] combined_topk_weights mismatch, max err: {(combined_topk_weights - ref_combined_topk_weights).abs().max()}' + assert torch.equal(combined_x, ref_combined_x), ( + f"[rank {rank}] combined_x mismatch, max err: {(combined_x - ref_combined_x).abs().max()}" + ) + assert torch.equal(combined_topk_weights, ref_combined_topk_weights), ( + f"[rank {rank}] combined_topk_weights mismatch, max err: {(combined_topk_weights - ref_combined_topk_weights).abs().max()}" + ) group.barrier() if rank == 0: - print('Check passed for combine. ✅') + print("Check passed for combine. ✅") if rank == 0: - print('All checks passed for TileScale intranode DeepEP. ✅') + print("All checks passed for TileScale intranode DeepEP. 
✅") # benchmark if rank == 0: - print( - f'========== Benchmarking {"cached" if cached_dispatch else "non-cached"} dispatch ==========' - ) + print(f"========== Benchmarking {'cached' if cached_dispatch else 'non-cached'} dispatch ==========") if not cached_dispatch: group.barrier() deepep_dispatch_time = ep_bench( - lambda: deepep_buffer. - dispatch(x, None, ref_num_tokens_per_rank, None, ref_is_token_in_rank, - ref_num_tokens_per_expert, topk_idx, topk_weights, expert_alignment), + lambda: deepep_buffer.dispatch( + x, + None, + ref_num_tokens_per_rank, + None, + ref_is_token_in_rank, + ref_num_tokens_per_expert, + topk_idx, + topk_weights, + expert_alignment, + ), warmup=50, - rep=50) - print(f'[rank {rank}] DeepEP dispatch time: {deepep_dispatch_time:.4f}ms') + rep=50, + ) + print(f"[rank {rank}] DeepEP dispatch time: {deepep_dispatch_time:.4f}ms") group.barrier() ts_dispatch_time = ep_bench( - lambda: ts_buffer. - dispatch(x, None, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, - topk_idx, topk_weights, expert_alignment), + lambda: ts_buffer.dispatch( + x, None, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, topk_idx, topk_weights, expert_alignment + ), warmup=50, - rep=50) - print(f'[rank {rank}] TileScale dispatch time: {ts_dispatch_time:.4f}ms') + rep=50, + ) + print(f"[rank {rank}] TileScale dispatch time: {ts_dispatch_time:.4f}ms") group.barrier() else: group.barrier() deepep_dispatch_time = ep_bench( - lambda: deepep_buffer. - dispatch(x, ref_handle, ref_num_tokens_per_rank, None, ref_is_token_in_rank, - ref_num_tokens_per_expert, None, None, expert_alignment), + lambda: deepep_buffer.dispatch( + x, ref_handle, ref_num_tokens_per_rank, None, ref_is_token_in_rank, ref_num_tokens_per_expert, None, None, expert_alignment + ), warmup=50, - rep=50) - print(f'[rank {rank}] DeepEP dispatch time: {deepep_dispatch_time:.4f}ms') + rep=50, + ) + print(f"[rank {rank}] DeepEP dispatch time: {deepep_dispatch_time:.4f}ms") group.barrier() ts_dispatch_time = ep_bench( - lambda: ts_buffer.dispatch(x, ref_handle, num_tokens_per_rank, is_token_in_rank, - num_tokens_per_expert, None, None, expert_alignment), + lambda: ts_buffer.dispatch( + x, ref_handle, num_tokens_per_rank, is_token_in_rank, num_tokens_per_expert, None, None, expert_alignment + ), warmup=50, - rep=50) - print(f'[rank {rank}] TileScale dispatch time: {ts_dispatch_time:.4f}ms') + rep=50, + ) + print(f"[rank {rank}] TileScale dispatch time: {ts_dispatch_time:.4f}ms") group.barrier() if rank == 0: - print('========== Benchmarking combine ==========') + print("========== Benchmarking combine ==========") group.barrier() - deepep_combine_time = ep_bench( - lambda: deepep_buffer.combine(recv_x, ref_handle, ref_recv_topk_weights), warmup=50, rep=50) - print(f'[rank {rank}] DeepEP combine time: {deepep_combine_time:.4f}ms') + deepep_combine_time = ep_bench(lambda: deepep_buffer.combine(recv_x, ref_handle, ref_recv_topk_weights), warmup=50, rep=50) + print(f"[rank {rank}] DeepEP combine time: {deepep_combine_time:.4f}ms") group.barrier() - ts_combine_time = ep_bench( - lambda: ts_buffer.combine(recv_x, handle, recv_topk_weights), warmup=50, rep=50) - print(f'[rank {rank}] TileScale combine time: {ts_combine_time:.4f}ms') + ts_combine_time = ep_bench(lambda: ts_buffer.combine(recv_x, handle, recv_topk_weights), warmup=50, rep=50) + print(f"[rank {rank}] TileScale combine time: {ts_combine_time:.4f}ms") group.barrier() if rank == 0: - print('========== Benchmarking report ==========') + print("========== 
Benchmarking report ==========") dispatch_bf16_nvl_recv_bytes = recv_x.numel() * 2 combine_bf16_nvl_send_bytes = dispatch_bf16_nvl_recv_bytes if rank == 0: print( - f'DeepEP dispatch time: {deepep_dispatch_time:.4f}ms, bandwidth: {dispatch_bf16_nvl_recv_bytes / deepep_dispatch_time / 1e6:.2f} GB/s (NVL)' + f"DeepEP dispatch time: {deepep_dispatch_time:.4f}ms, bandwidth: {dispatch_bf16_nvl_recv_bytes / deepep_dispatch_time / 1e6:.2f} GB/s (NVL)" ) print( - f'TileScale dispatch time: {ts_dispatch_time:.4f}ms, bandwidth: {dispatch_bf16_nvl_recv_bytes / ts_dispatch_time / 1e6:.2f} GB/s (NVL)' + f"TileScale dispatch time: {ts_dispatch_time:.4f}ms, bandwidth: {dispatch_bf16_nvl_recv_bytes / ts_dispatch_time / 1e6:.2f} GB/s (NVL)" ) print( - f'DeepEP combine time: {deepep_combine_time:.4f}ms, bandwidth: {combine_bf16_nvl_send_bytes / deepep_combine_time / 1e6:.2f} GB/s (NVL)' + f"DeepEP combine time: {deepep_combine_time:.4f}ms, bandwidth: {combine_bf16_nvl_send_bytes / deepep_combine_time / 1e6:.2f} GB/s (NVL)" ) print( - f'TileScale combine time: {ts_combine_time:.4f}ms, bandwidth: {combine_bf16_nvl_send_bytes / ts_combine_time / 1e6:.2f} GB/s (NVL)' + f"TileScale combine time: {ts_combine_time:.4f}ms, bandwidth: {combine_bf16_nvl_send_bytes / ts_combine_time / 1e6:.2f} GB/s (NVL)" ) @@ -227,12 +244,10 @@ def parse_args(): parser.add_argument("--num_ranks", type=int, default=8, help="Number of ranks") parser.add_argument("--num_tokens", type=int, default=4096, help="Number of tokens") parser.add_argument("--hidden", type=int, default=7168, help="Hidden size") - parser.add_argument( - "--num_topk", type=int, default=8, help="Number of top-k experts to select for each token") + parser.add_argument("--num_topk", type=int, default=8, help="Number of top-k experts to select for each token") parser.add_argument("--num_experts", type=int, default=32, help="Number of experts") parser.add_argument("--expert_alignment", type=int, default=1, help="Expert alignment") - parser.add_argument( - "--cached", action="store_true", default=False, help="Whether to use cached dispatch") + parser.add_argument("--cached", action="store_true", default=False, help="Whether to use cached dispatch") return parser.parse_args() diff --git a/examples/distributed/deepseek_deepep/intranode/get_dispatch_layout.py b/examples/distributed/deepseek_deepep/intranode/get_dispatch_layout.py index 97b67d1a4..c696297e1 100644 --- a/examples/distributed/deepseek_deepep/intranode/get_dispatch_layout.py +++ b/examples/distributed/deepseek_deepep/intranode/get_dispatch_layout.py @@ -15,8 +15,8 @@ # TODO(wt): Add async functionality def get_dispatch_layout( - topk_idx: torch.Tensor, num_experts: int, - num_ranks: int) -> Tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor]: + topk_idx: torch.Tensor, num_experts: int, num_ranks: int +) -> Tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor]: """Calculate the layout required for later communication. 
Arguments: @@ -42,9 +42,9 @@ def get_dispatch_layout( # Allocate tensors # TODO(wt): Wait on previous events and allocate on comm stream when adding async functionality num_tokens, num_topk = topk_idx.shape - num_tokens_per_rank = torch.empty(num_ranks, dtype=torch.int32, device='cuda') - num_tokens_per_expert = torch.empty(num_experts, dtype=torch.int32, device='cuda') - is_token_in_rank = torch.empty((num_tokens, num_ranks), dtype=torch.bool, device='cuda') + num_tokens_per_rank = torch.empty(num_ranks, dtype=torch.int32, device="cuda") + num_tokens_per_expert = torch.empty(num_experts, dtype=torch.int32, device="cuda") + is_token_in_rank = torch.empty((num_tokens, num_ranks), dtype=torch.bool, device="cuda") # Launch the kernel kernel = get_dispatch_layout_kernel(num_topk, num_experts, num_ranks) @@ -72,14 +72,14 @@ def get_dispatch_layout_kernel( num_sms = T.ceildiv(num_experts, experts_per_sm) + T.ceildiv(num_ranks, ranks_per_sm) experts_per_rank = num_experts // num_ranks - num_tokens = T.dynamic('num_tokens') + num_tokens = T.dynamic("num_tokens") @T.prim_func def get_dispatch_layout_main( - topk_idx: T.Tensor([num_tokens, num_topk], "int64"), # type: ignore - num_tokens_per_rank: T.Tensor([num_ranks], "int32"), # type: ignore - num_tokens_per_expert: T.Tensor([num_experts], "int32"), # type: ignore - is_token_in_rank: T.Tensor([num_tokens, num_ranks], "bool"), # type: ignore + topk_idx: T.Tensor([num_tokens, num_topk], "int64"), # type: ignore + num_tokens_per_rank: T.Tensor([num_ranks], "int32"), # type: ignore + num_tokens_per_expert: T.Tensor([num_experts], "int32"), # type: ignore + is_token_in_rank: T.Tensor([num_tokens, num_ranks], "bool"), # type: ignore ): with T.Kernel(num_sms, threads=threads) as bx: tx = T.get_thread_binding() diff --git a/examples/distributed/deepseek_deepep/intranode/test_intranode.py b/examples/distributed/deepseek_deepep/intranode/test_intranode.py index 317721996..c6f8a55c6 100644 --- a/examples/distributed/deepseek_deepep/intranode/test_intranode.py +++ b/examples/distributed/deepseek_deepep/intranode/test_intranode.py @@ -3,6 +3,7 @@ import example_intranode +@tilelang.testing.requires_distributed @tilelang.testing.requires_cuda def test_intranode(monkeypatch): monkeypatch.setattr("sys.argv", ["example_intranode.py"]) # optionally add testing params here diff --git a/examples/distributed/example_all_to_all.py b/examples/distributed/example_all_to_all.py index 328ebc86b..dd0157c89 100644 --- a/examples/distributed/example_all_to_all.py +++ b/examples/distributed/example_all_to_all.py @@ -11,7 +11,6 @@ def all_to_all(PE_num, TOKEN_NUM, TOPK, HIDDEN, EXPERT_NUM, dtype="float16"): - EXPERTS_PER_RANK = EXPERT_NUM // PE_num @T.prim_func @@ -37,8 +36,8 @@ def main( m_end[0] = splits_cumsum[(peer + 1) * EXPERTS_PER_RANK] T.putmem_nbi_block( - T.address_of(data_dst[0, 0]), T.address_of(data_src[m_start[0], 0]), - (m_end[0] - m_start[0]) * HIDDEN * 2, peer) + T.address_of(data_dst[0, 0]), T.address_of(data_src[m_start[0], 0]), (m_end[0] - m_start[0]) * HIDDEN * 2, peer + ) T.fence() @@ -119,7 +118,7 @@ def splits_to_cumsum(splits: torch.Tensor): # print("split_cumsum:", split_cumsum) data_src = pynvshmem.nvshmem_create_tensor([args.M * args.topk, args.N], torch.float16) -data_src[:].copy_(ref_tensor[args.M * args.topk * RANK:args.M * args.topk * (RANK + 1), :]) +data_src[:].copy_(ref_tensor[args.M * args.topk * RANK : args.M * args.topk * (RANK + 1), :]) splits_cumsum = pynvshmem.nvshmem_create_tensor([args.G + 1], torch.int32) 
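# NOTE: minimal illustrative sketch (an assumption, not the example's actual
# helper) of what `splits_to_cumsum` above is expected to produce: an exclusive
# prefix sum with a leading zero, so expert g's token rows occupy
# [cumsum[g], cumsum[g + 1]) and the all-to-all kernel can bound each peer's slice
# via splits_cumsum[peer * EXPERTS_PER_RANK] and
# splits_cumsum[(peer + 1) * EXPERTS_PER_RANK], which is why the buffer is
# allocated with length args.G + 1.
def _splits_to_cumsum_sketch(splits):
    # `splits` is a 1-D tensor of per-expert token counts (length G).
    import torch  # local import keeps this sketch copy-pasteable on its own

    out = torch.zeros(splits.numel() + 1, dtype=torch.int32, device=splits.device)
    out[1:] = torch.cumsum(splits, dim=0)  # e.g. [3, 1, 4] -> [0, 3, 4, 8]
    return out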
splits_cumsum[:].copy_(split_cumsum) diff --git a/examples/distributed/example_allgather.py b/examples/distributed/example_allgather.py index bc9cb3e1b..56e865391 100644 --- a/examples/distributed/example_allgather.py +++ b/examples/distributed/example_allgather.py @@ -13,8 +13,8 @@ def allgather(PE_num, M, N, dtype="float16", threads=128): @T.prim_func def a2a_split( - A: T.Tensor((M_per_rank, N), dtype), # type: ignore - B: T.Tensor((M, N), dtype), # type: ignore + A: T.Tensor((M_per_rank, N), dtype), # type: ignore + B: T.Tensor((M, N), dtype), # type: ignore ): # Each block is responsible for sending (block_M, N) to exact one rank. with T.Kernel(M_per_rank // block_M, PE_num - 1, threads=threads) as (bx, by): @@ -24,11 +24,9 @@ def a2a_split( A_shared = T.alloc_shared((block_M, N), dtype) local_base = bx * block_M global_base = M_per_rank * mype + local_base - T.copy(A[local_base:local_base + block_M, :], A_shared) + T.copy(A[local_base : local_base + block_M, :], A_shared) peer = (mype + by + 1) % npes - T.putmem_nbi_block( - T.address_of(B[global_base, 0]), T.address_of(A_shared[0, 0]), - block_M * N * dtype_map[dtype].itemsize, peer) + T.putmem_nbi_block(T.address_of(B[global_base, 0]), T.address_of(A_shared[0, 0]), block_M * N * dtype_map[dtype].itemsize, peer) return a2a_split @@ -37,8 +35,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--M", type=int, default=8192) parser.add_argument("--N", type=int, default=12288) - parser.add_argument( - "--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) + parser.add_argument("--dtype", type=str, default="float16", choices=["float16", "float32", "bfloat16"]) parser.add_argument("--threads", type=int, default=128, help="number of threads in a block") parser.add_argument("--print_source", action="store_true", help="print kernel source code") parser.add_argument("--warmup", type=int, default=1, help="number of warmup iterations") @@ -46,7 +43,7 @@ def parse_args(): return parser.parse_args() -if __name__ == '__main__': +if __name__ == "__main__": WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP = init_distributed(return_tp_group=True) assert WORLD_SIZE <= 8, "This benchmark is designed for intra-node communication" @@ -82,7 +79,7 @@ def tilelang_ag(): ag_buffer = pynvshmem.nvshmem_create_tensor([M_per_rank, N], torch_dtype) ag_buffer.copy_(local_data) out = pynvshmem.nvshmem_create_tensor([M, N], torch_dtype) - out[RANK * M_per_rank:(RANK + 1) * M_per_rank, :].copy_(local_data) + out[RANK * M_per_rank : (RANK + 1) * M_per_rank, :].copy_(local_data) kernel(ag_buffer, out) pynvshmem.nvshmem_barrier_all() # Ensure all ranks have completed return out diff --git a/examples/distributed/example_allgather_gemm.py b/examples/distributed/example_allgather_gemm.py index 96f95a797..702f1264a 100644 --- a/examples/distributed/example_allgather_gemm.py +++ b/examples/distributed/example_allgather_gemm.py @@ -8,16 +8,15 @@ def allgather_gemm(PE_num, M, N, K, block_M, block_N, block_K, dtype="float16"): - accum_dtype = "float" @T.prim_func def main( - A: T.Buffer((M, K), dtype), - A_ag: T.Buffer((M * PE_num, K), dtype), - B: T.Buffer((K, N), dtype), - signal: T.Buffer((PE_num,), "uint64"), - C: T.Buffer((M * PE_num, N), dtype), + A: T.Buffer((M, K), dtype), + A_ag: T.Buffer((M * PE_num, K), dtype), + B: T.Buffer((K, N), dtype), + signal: T.Buffer((PE_num,), "uint64"), + C: T.Buffer((M * PE_num, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = 
T.alloc_shared((block_M, block_K), dtype) @@ -36,8 +35,14 @@ def main( for k in T.serial(PE_num - 1): peer[0] = (mype[0] + 1 + k) % npes[0] T.putmem_signal_nbi_block( - T.address_of(A_ag[mype[0] * M, 0]), T.address_of(A[0, 0]), - block_M * block_K * 2, T.address_of(signal[k]), k + 1, 9, peer[0]) + T.address_of(A_ag[mype[0] * M, 0]), + T.address_of(A[0, 0]), + block_M * block_K * 2, + T.address_of(signal[k]), + k + 1, + 9, + peer[0], + ) for k in T.serial(PE_num - 1): T.signal_wait_until(T.address_of(signal[k]), 0, k + 1) @@ -60,13 +65,7 @@ def main( WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP = init_distributed(return_tp_group=True) PE_num = WORLD_SIZE func = allgather_gemm(PE_num, M, N, K, block_M, block_N, block_K) -kernel = tilelang.compile( - func, - out_idx=-1, - pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) +kernel = tilelang.compile(func, out_idx=-1, pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) # Get CUDA Source if RANK == 0: @@ -90,9 +89,9 @@ def ref_program(A, B): C_ref = ref_program(A_tensor, B_tensor) print("C_ref:", C_ref) -#profiler.init_distributed() +# profiler.init_distributed() A_local = pynvshmem.nvshmem_create_tensor([M, K], dtype) -A_local[:].copy_(A_tensor[M * RANK:M * (RANK + 1), :]) +A_local[:].copy_(A_tensor[M * RANK : M * (RANK + 1), :]) A_ag_local = pynvshmem.nvshmem_create_tensor([M * PE_num, K], dtype) A_ag_local.fill_(0) diff --git a/examples/distributed/example_allgather_gemm_overlapped.py b/examples/distributed/example_allgather_gemm_overlapped.py index cebf58ed1..309481967 100644 --- a/examples/distributed/example_allgather_gemm_overlapped.py +++ b/examples/distributed/example_allgather_gemm_overlapped.py @@ -12,6 +12,7 @@ cuda_python_version = importlib.metadata.version("cuda-python") from packaging import version + if version.parse(cuda_python_version) >= version.parse("12.8.0"): from cuda.bindings import driver as cuda else: @@ -19,14 +20,15 @@ from tilelang.distributed import perf_fn tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log @tilelang.jit(pass_configs={"tl.disable_warp_specialized": True, "tl.disable_tma_lower": True}) def set_signal_kernel(local_rank, num_local_ranks, threads): - @T.prim_func - def _set_signal_kernel(signal_buffer: T.Tensor((num_local_ranks), "uint32"),): + def _set_signal_kernel( + signal_buffer: T.Tensor((num_local_ranks), "uint32"), + ): with T.Kernel(1, threads=threads): tx = T.get_thread_binding(0) if tx < num_local_ranks: @@ -39,19 +41,9 @@ def _set_signal_kernel(signal_buffer: T.Tensor((num_local_ranks), "uint32"),): @tilelang.jit -def gemm_kernel(M, - N, - K, - local_rank, - num_local_rank, - block_M, - block_N, - block_K, - threads, - persistent=False, - dtype="float16", - accum_dtype="float"): - +def gemm_kernel( + M, N, K, local_rank, num_local_rank, block_M, block_N, block_K, threads, persistent=False, dtype="float16", accum_dtype="float" +): sm_num = driver.get_num_sms() m_blocks = T.ceildiv(M, block_M) n_blocks = T.ceildiv(N // num_local_rank, block_N) @@ -61,14 +53,12 @@ def gemm_kernel(M, @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N // num_local_rank), dtype), - signal_buffer: T.Tensor((num_local_rank), "uint32"), - C: T.Tensor((M, N // num_local_rank), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N // num_local_rank), dtype), + signal_buffer: T.Tensor((num_local_rank), "uint32"), + C: T.Tensor((M, N // 
num_local_rank), dtype), ): - with T.Kernel( - T.ceildiv(M, block_M) * T.ceildiv(N // num_local_rank, block_N), - threads=threads) as (bid): + with T.Kernel(T.ceildiv(M, block_M) * T.ceildiv(N // num_local_rank, block_N), threads=threads) as (bid): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), dtype) @@ -103,10 +93,10 @@ def main( @T.prim_func def main_persistent( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N // num_local_rank), dtype), - signal_buffer: T.Tensor((num_local_rank), "uint32"), - C: T.Tensor((M, N // num_local_rank), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N // num_local_rank), dtype), + signal_buffer: T.Tensor((num_local_rank), "uint32"), + C: T.Tensor((M, N // num_local_rank), dtype), ): with T.Kernel(sm_num, threads=threads) as (bid): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -162,8 +152,8 @@ def cp_engine_producer_all_gather_full_mesh_pull( for src_rank in rank_orders: if src_rank == local_rank: continue - dst = ag_buffer[local_rank][src_rank * M_per_rank:(src_rank + 1) * M_per_rank, :] - src = ag_buffer[src_rank][src_rank * M_per_rank:(src_rank + 1) * M_per_rank, :] + dst = ag_buffer[local_rank][src_rank * M_per_rank : (src_rank + 1) * M_per_rank, :] + src = ag_buffer[src_rank][src_rank * M_per_rank : (src_rank + 1) * M_per_rank, :] dst.copy_(src) (err,) = cuda.cuStreamWriteValue32( @@ -175,21 +165,33 @@ def cp_engine_producer_all_gather_full_mesh_pull( CUDA_CHECK(err) -def ag_gemm_op(A, B, C, ag_buffer, signal_buffer, M_per_rank, N, signal_target, local_rank, - local_world_size, set_signal_kernel, gemm_kernel, gemm_stream, ag_stream): - +def ag_gemm_op( + A, + B, + C, + ag_buffer, + signal_buffer, + M_per_rank, + N, + signal_target, + local_rank, + local_world_size, + set_signal_kernel, + gemm_kernel, + gemm_stream, + ag_stream, +): with torch.cuda.stream(gemm_stream): - set_signal_kernel(signal_buffer[local_rank], stream=gemm_stream.cuda_stream) + set_signal_kernel(signal_buffer[local_rank]) ag_stream.wait_stream(gemm_stream) - cp_engine_producer_all_gather_full_mesh_pull(ag_buffer, signal_buffer, M_per_rank, - signal_target, local_rank, local_world_size, - ag_stream) + cp_engine_producer_all_gather_full_mesh_pull( + ag_buffer, signal_buffer, M_per_rank, signal_target, local_rank, local_world_size, ag_stream + ) with torch.cuda.stream(gemm_stream): - gemm_kernel( - ag_buffer[local_rank], B, signal_buffer[local_rank], C, stream=gemm_stream.cuda_stream) + gemm_kernel(ag_buffer[local_rank], B, signal_buffer[local_rank], C) gemm_stream.wait_stream(ag_stream) current_stream = torch.cuda.current_stream() @@ -225,14 +227,9 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) assert rank == local_rank and num_ranks == num_local_ranks, "only support single node for now" allocator = tilelang.get_allocator( - size=2**30, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) - gemm_func = gemm_kernel(M, N, K, local_rank, num_local_ranks, BLOCK_M, BLOCK_N, BLOCK_K, - threads, persistent) + size=2**30, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) + gemm_func = gemm_kernel(M, N, K, local_rank, num_local_ranks, BLOCK_M, BLOCK_N, BLOCK_K, threads, persistent) set_signal_func = set_signal_kernel( local_rank=local_rank, 
num_local_ranks=num_local_ranks, @@ -247,11 +244,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): B = tilelang.tensor((K, N_per_rank), dtype, allocator=allocator).normal_() C = tilelang.tensor((M, N_per_rank), dtype, allocator=allocator) ag_buffer = tilelang.tensor((M, K), dtype, allocator=allocator, return_peers=True) - A = ag_buffer[local_rank][M_per_rank * local_rank:M_per_rank * (local_rank + 1), :].normal_() - signal_buffer = tilelang.tensor((num_local_ranks,), - torch.uint32, - allocator=allocator, - return_peers=True) + A = ag_buffer[local_rank][M_per_rank * local_rank : M_per_rank * (local_rank + 1), :].normal_() + signal_buffer = tilelang.tensor((num_local_ranks,), torch.uint32, allocator=allocator, return_peers=True) gemm_stream = torch.cuda.Stream() ag_stream = torch.cuda.Stream(priority=-1) @@ -259,9 +253,22 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): dist.barrier() - tilelang_C = ag_gemm_op(A, B, C, ag_buffer, signal_buffer, M_per_rank, K, signal_target, - local_rank, num_local_ranks, set_signal_func, gemm_func, gemm_stream, - ag_stream) + tilelang_C = ag_gemm_op( + A, + B, + C, + ag_buffer, + signal_buffer, + M_per_rank, + K, + signal_target, + local_rank, + num_local_ranks, + set_signal_func, + gemm_func, + gemm_stream, + ag_stream, + ) torch_ag_buffer = torch.empty([M, K], dtype=dtype, device="cuda") torch_C = torch_ag_gemm(group, A, B, torch_ag_buffer) @@ -273,27 +280,38 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): print(f"torch_C: {torch_C}, tilelang_C: {tilelang_C}") _, tl_t = perf_fn( - lambda: - ag_gemm_op(A, B, C, ag_buffer, signal_buffer, M_per_rank, K, signal_target, local_rank, - num_local_ranks, set_signal_func, gemm_func, gemm_stream, ag_stream), + lambda: ag_gemm_op( + A, + B, + C, + ag_buffer, + signal_buffer, + M_per_rank, + K, + signal_target, + local_rank, + num_local_ranks, + set_signal_func, + gemm_func, + gemm_stream, + ag_stream, + ), warmup=5, - rep=10) - - print( - f"rank {local_rank} tilelang ag_gemm time: {tl_t:.2f} ms, TFLOPS: {2*M*N*K/1e9/(tl_t)/num_local_ranks:.2f}" + rep=10, ) + print(f"rank {local_rank} tilelang ag_gemm time: {tl_t:.2f} ms, TFLOPS: {2 * M * N * K / 1e9 / (tl_t) / num_local_ranks:.2f}") + dist.destroy_process_group() if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=8192, help='M dimension') - parser.add_argument('--N', type=int, default=28672, help='N dimension') - parser.add_argument('--K', type=int, default=8192, help='K dimension') - parser.add_argument('--persistent', action='store_true', help='Use persistent kernel') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=8192, help="M dimension") + parser.add_argument("--N", type=int, default=28672, help="N dimension") + parser.add_argument("--K", type=int, default=8192, help="K dimension") + parser.add_argument("--persistent", action="store_true", help="Use persistent kernel") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/example_cannon.py b/examples/distributed/example_cannon.py index 649be6c4b..ad25a41e7 100644 --- a/examples/distributed/example_cannon.py +++ b/examples/distributed/example_cannon.py @@ -11,7 +11,6 @@ def cannon(MESH, M, N, K, block_M, 
block_N, block_K, dtype="float16", specialize=False): - M_local = T.ceildiv(M, MESH) N_local = T.ceildiv(N, MESH) K_local = T.ceildiv(K, MESH) @@ -22,13 +21,13 @@ def cannon(MESH, M, N, K, block_M, block_N, block_K, dtype="float16", specialize @T.prim_func def main( - A: T.Tensor((2, M_local, K_local), dtype), - B: T.Tensor((2, N_local, K_local), dtype), - A_signal_to: T.Tensor((T.ceildiv(M, block_M),), "uint64"), - A_signal_from: T.Tensor((T.ceildiv(M, block_M),), "uint64"), - B_signal_to: T.Tensor((T.ceildiv(N, block_N),), "uint64"), - B_signal_from: T.Tensor((T.ceildiv(N, block_N),), "uint64"), - C: T.Tensor((M_local, N_local), dtype), + A: T.Tensor((2, M_local, K_local), dtype), + B: T.Tensor((2, N_local, K_local), dtype), + A_signal_to: T.Tensor((T.ceildiv(M, block_M),), "uint64"), + A_signal_from: T.Tensor((T.ceildiv(M, block_M),), "uint64"), + B_signal_to: T.Tensor((T.ceildiv(N, block_N),), "uint64"), + B_signal_from: T.Tensor((T.ceildiv(N, block_N),), "uint64"), + C: T.Tensor((M_local, N_local), dtype), ): grid_size = T.min(sm_num, total_tiles) A_rows_per_block = T.ceildiv(M_local, grid_size) @@ -72,16 +71,23 @@ def main( T.address_of(A[(ko + 1) % 2, A_rows_per_block * block_id, 0]), T.address_of(A[ko % 2, A_rows_per_block * block_id, 0]), A_rows_per_block * K_local * dtype_map[dtype].itemsize, - T.address_of(A_signal_to[0]), 1, T.Amo.SIGNAL_ADD, a_peer_to[0]) + T.address_of(A_signal_to[0]), + 1, + T.Amo.SIGNAL_ADD, + a_peer_to[0], + ) if block_id < T.ceildiv(N_local, B_cols_per_block): T.putmem_signal_nbi_block( T.address_of(B[(ko + 1) % 2, B_cols_per_block * block_id, 0]), T.address_of(B[ko % 2, B_cols_per_block * block_id, 0]), B_cols_per_block * K_local * dtype_map[dtype].itemsize, - T.address_of(B_signal_to[0]), 1, T.Amo.SIGNAL_ADD, b_peer_to[0]) + T.address_of(B_signal_to[0]), + 1, + T.Amo.SIGNAL_ADD, + b_peer_to[0], + ) for w in T.serial(waves): - bx = (grid_size * w + block_id) // T.ceildiv(N_local, block_N) by = (grid_size * w + block_id) % T.ceildiv(N_local, block_N) @@ -122,13 +128,13 @@ def main( # TODO: fix correctness @T.prim_func def main_specialize( - A: T.Tensor((2, M_local, K_local), dtype), - B: T.Tensor((2, N_local, K_local), dtype), - A_signal_to: T.Tensor((T.ceildiv(M, block_M),), "uint64"), - A_signal_from: T.Tensor((T.ceildiv(M, block_M),), "uint64"), - B_signal_to: T.Tensor((T.ceildiv(N, block_N),), "uint64"), - B_signal_from: T.Tensor((T.ceildiv(N, block_N),), "uint64"), - C: T.Tensor((M_local, N_local), dtype), + A: T.Tensor((2, M_local, K_local), dtype), + B: T.Tensor((2, N_local, K_local), dtype), + A_signal_to: T.Tensor((T.ceildiv(M, block_M),), "uint64"), + A_signal_from: T.Tensor((T.ceildiv(M, block_M),), "uint64"), + B_signal_to: T.Tensor((T.ceildiv(N, block_N),), "uint64"), + B_signal_from: T.Tensor((T.ceildiv(N, block_N),), "uint64"), + C: T.Tensor((M_local, N_local), dtype), ): # 0-compute blocks: compute # compute_blocks-grid_size: copy @@ -172,21 +178,26 @@ def main_specialize( total_tiles * ko, ) T.putmem_signal_nbi_block( - T.address_of(A[(ko + 1) % 2, A_rows_per_block * (block_id - compute_blocks), - 0]), + T.address_of(A[(ko + 1) % 2, A_rows_per_block * (block_id - compute_blocks), 0]), T.address_of(A[ko % 2, A_rows_per_block * (block_id - compute_blocks), 0]), A_rows_per_block * K_local * dtype_map[dtype].itemsize, - T.address_of(A_signal_to[0]), 1, T.Amo.SIGNAL_ADD, a_peer_to[0]) + T.address_of(A_signal_to[0]), + 1, + T.Amo.SIGNAL_ADD, + a_peer_to[0], + ) T.putmem_signal_nbi_block( - T.address_of(B[(ko + 1) % 2, B_cols_per_block * 
(block_id - compute_blocks), - 0]), + T.address_of(B[(ko + 1) % 2, B_cols_per_block * (block_id - compute_blocks), 0]), T.address_of(B[ko % 2, B_cols_per_block * (block_id - compute_blocks), 0]), B_cols_per_block * K_local * dtype_map[dtype].itemsize, - T.address_of(B_signal_to[0]), 1, T.Amo.SIGNAL_ADD, b_peer_to[0]) + T.address_of(B_signal_to[0]), + 1, + T.Amo.SIGNAL_ADD, + b_peer_to[0], + ) if block_id < compute_blocks: for w in T.serial(waves): - bx = (compute_blocks * w + block_id) // T.ceildiv(N_local, block_N) by = (compute_blocks * w + block_id) % T.ceildiv(N_local, block_N) @@ -256,11 +267,7 @@ def parse_args(): K_local = math.ceil(K / MESH) func = cannon(MESH, M, N, K, block_M, block_N, block_K, args.dtype, specialize) - kernel = tilelang.compile( - func, pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) + kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) # Get CUDA Source if RANK == 0: @@ -281,11 +288,9 @@ def parse_args(): b_scatter_list = [] for r in range(WORLD_SIZE): rr, cc = divmod(r, MESH) - c_tile = C[M_local * rr:M_local * (rr + 1), N_local * cc:N_local * (cc + 1)] - a_tile = A[M_local * rr:M_local * (rr + 1), - K_local * ((cc + rr) % MESH):K_local * ((cc + rr) % MESH + 1)] - b_tile = B[N_local * cc:N_local * (cc + 1), - K_local * ((cc + rr) % MESH):K_local * ((cc + rr) % MESH + 1)] + c_tile = C[M_local * rr : M_local * (rr + 1), N_local * cc : N_local * (cc + 1)] + a_tile = A[M_local * rr : M_local * (rr + 1), K_local * ((cc + rr) % MESH) : K_local * ((cc + rr) % MESH + 1)] + b_tile = B[N_local * cc : N_local * (cc + 1), K_local * ((cc + rr) % MESH) : K_local * ((cc + rr) % MESH + 1)] c_scatter_list.append(c_tile.contiguous()) a_scatter_list.append(a_tile.contiguous()) @@ -320,7 +325,7 @@ def parse_args(): dist.barrier() if r == RANK: if torch.allclose(C_tilelang, ref, rtol=1e-2, atol=1e-2): - print('-' * 100) + print("-" * 100) print(f"[Rank {RANK}] ✅ Tilelang and Torch match") else: abs_error = torch.abs(C_tilelang - ref) @@ -330,7 +335,7 @@ def parse_args(): max_rel_error = rel_error.max().item() mismatch_ratio = (abs_error > (1e-2 + 1e-2 * torch.abs(ref))).float().mean().item() - print('-' * 100) + print("-" * 100) print(f"[Rank {RANK}] ❌ Tilelang and Torch mismatch") print(f"[Rank {RANK}] ref:\n{ref}") print(f"[Rank {RANK}] tilelang:\n{C_tilelang}") @@ -381,8 +386,7 @@ def reduce_local_time(local_time): total_flops = 2 * M * N * K -avg_time = reduce_local_time( - bench(kernel, A, B, A_signal_to, A_signal_from, B_signal_to, B_signal_from, C_tilelang)) +avg_time = reduce_local_time(bench(kernel, A, B, A_signal_to, A_signal_from, B_signal_to, B_signal_from, C_tilelang)) if RANK == 0: print(f"avg time of RANK {RANK}: {avg_time} ms") diff --git a/examples/distributed/example_gemm_rs_overlapped.py b/examples/distributed/example_gemm_rs_overlapped.py index 4fb1c6d43..27c2278bd 100644 --- a/examples/distributed/example_gemm_rs_overlapped.py +++ b/examples/distributed/example_gemm_rs_overlapped.py @@ -14,19 +14,9 @@ @tilelang.jit -def gemm_kernel(M, - N, - K, - local_rank, - num_local_rank, - block_M, - block_N, - block_K, - threads, - persistent=False, - dtype="float16", - accum_dtype="float"): - +def gemm_kernel( + M, N, K, local_rank, num_local_rank, block_M, block_N, block_K, threads, persistent=False, dtype="float16", accum_dtype="float" +): M_per_rank = T.ceildiv(M, num_local_rank) GROUP_SIZE_M = 8 @@ -41,11 +31,11 @@ def swizzle_2d(tile_id, num_pid_m, num_pid_n): 
@T.prim_func def main( - A: T.Tensor((M, K // num_local_rank), dtype), - B: T.Tensor((K // num_local_rank, N), dtype), - scatter_signal_buf: T.Tensor((num_local_rank), "uint32"), - counter_signal_buf: T.Tensor((num_local_rank), "uint32"), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K // num_local_rank), dtype), + B: T.Tensor((K // num_local_rank, N), dtype), + scatter_signal_buf: T.Tensor((num_local_rank), "uint32"), + counter_signal_buf: T.Tensor((num_local_rank), "uint32"), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(M, block_M) * T.ceildiv(N, block_N), threads=threads) as (bid): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -87,27 +77,12 @@ def main( return main -def gemm_rs_op(A, - B, - C, - output, - ctx, - gemm_kernel, - gemm_stream, - rs_stream, - local_rank, - print_source=False): - +def gemm_rs_op(A, B, C, output, ctx, gemm_kernel, gemm_stream, rs_stream, local_rank, print_source=False): current_stream = torch.cuda.current_stream() rs_stream.wait_stream(gemm_stream) - gemm_kernel( - A, - B, - ctx.scatter_signal_bufs[local_rank], - ctx.counter_bufs[local_rank], - C, - stream=gemm_stream.cuda_stream) + with torch.cuda.stream(gemm_stream): + gemm_kernel(A, B, ctx.scatter_signal_bufs[local_rank], ctx.counter_bufs[local_rank], C) if print_source and local_rank == 1: print(gemm_kernel.get_kernel_source()) @@ -155,14 +130,9 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) assert rank == local_rank and num_ranks == num_local_ranks, "only support single node for now" allocator = tilelang.get_allocator( - size=2**30, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) - gemm_func = gemm_kernel(M, N, K, local_rank, num_local_ranks, BLOCK_M, BLOCK_N, BLOCK_K, - threads, persistent) + size=2**30, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) + gemm_func = gemm_kernel(M, N, K, local_rank, num_local_ranks, BLOCK_M, BLOCK_N, BLOCK_K, threads, persistent) gemm_func.initialize(allocator=allocator) A = tilelang.tensor((M, K_per_rank), dtype, allocator=allocator).normal_() / 10 @@ -172,20 +142,12 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): gemm_stream = torch.cuda.Stream() rs_stream = torch.cuda.Stream(priority=-1) ctx = create_reduce_scater_2d_ctx( - M, - N, - local_rank, - num_local_ranks, - num_local_ranks, - dtype, - allocator, - overlap_with_gemm=True, - num_reduction_sms=15) + M, N, local_rank, num_local_ranks, num_local_ranks, dtype, allocator, overlap_with_gemm=True, num_reduction_sms=15 + ) dist.barrier() - tilelang_out = gemm_rs_op( - A, B, C, output, ctx, gemm_func, gemm_stream, rs_stream, local_rank, print_source=True) + tilelang_out = gemm_rs_op(A, B, C, output, ctx, gemm_func, gemm_stream, rs_stream, local_rank, print_source=True) torch_out = torch_gemm_rs(group, A, B, None, num_local_ranks) atol = 1e-2 @@ -196,26 +158,20 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): print(f"rank {local_rank} check failed.❌") print(f"torch_out: {torch_out}, tilelang_out: {tilelang_out}") - _, tl_t = perf_fn( - lambda: gemm_rs_op(A, B, C, output, ctx, gemm_func, gemm_stream, rs_stream, local_rank), - warmup=5, - rep=5) + _, tl_t = perf_fn(lambda: gemm_rs_op(A, B, C, output, ctx, gemm_func, gemm_stream, rs_stream, local_rank), warmup=5, rep=5) - print( - f"rank {local_rank} tilelang gemm_rs time: 
{tl_t:.2f} ms, TFLOPS: {2*M*N*K/1e9/(tl_t)/num_local_ranks:.2f}" - ) + print(f"rank {local_rank} tilelang gemm_rs time: {tl_t:.2f} ms, TFLOPS: {2 * M * N * K / 1e9 / (tl_t) / num_local_ranks:.2f}") dist.destroy_process_group() if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=8192, help='M dimension') - parser.add_argument('--N', type=int, default=8192, help='N dimension') - parser.add_argument('--K', type=int, default=29568, help='K dimension') - parser.add_argument('--persistent', action='store_true', help='Use persistent kernel') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=8192, help="M dimension") + parser.add_argument("--N", type=int, default=8192, help="N dimension") + parser.add_argument("--K", type=int, default=29568, help="K dimension") + parser.add_argument("--persistent", action="store_true", help="Use persistent kernel") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/example_nvshmem.py b/examples/distributed/example_nvshmem.py index 6499a4648..8f8de69ed 100644 --- a/examples/distributed/example_nvshmem.py +++ b/examples/distributed/example_nvshmem.py @@ -29,11 +29,10 @@ def tilelang_callback_cuda_postproc(code, _): def dist_test(M, N, block_M, block_N, dtype="int16"): - @T.prim_func def main( - A: T.Buffer((M, N), dtype), - B: T.Buffer((M, N), dtype), + A: T.Buffer((M, N), dtype), + B: T.Buffer((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_N), dtype) diff --git a/examples/distributed/example_overlapping_allgather.py b/examples/distributed/example_overlapping_allgather.py index 13c3e6dac..281e07dee 100644 --- a/examples/distributed/example_overlapping_allgather.py +++ b/examples/distributed/example_overlapping_allgather.py @@ -19,28 +19,24 @@ def internode_gather(M, local_world_size, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M), "float32"), - src: T.Tensor((M), "float32"), + dst: T.Tensor((M), "float32"), + src: T.Tensor((M), "float32"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") rank[0] = (T.get_pe() + local_world_size) % (2 * local_world_size) # 2 nodes - T.putmem_nbi_block( - T.address_of(dst[bx * block_M]), T.address_of(src[bx * block_M]), block_M * 4, - rank[0]) + T.putmem_nbi_block(T.address_of(dst[bx * block_M]), T.address_of(src[bx * block_M]), block_M * 4, rank[0]) return main def intranode_gather(M, world_size, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M * world_size), "float32"), - src: T.Tensor((M * 2), "float32"), + dst: T.Tensor((M * world_size), "float32"), + src: T.Tensor((M * 2), "float32"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") @@ -68,24 +64,19 @@ def main( return main -if __name__ == '__main__': +if __name__ == "__main__": tilelang.disable_cache() M = 2 K = 12288 - #for 2 node(16 GPUs), world_size=16,rank is 0-15,local rank is 0-7 - WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP, LC_GROUP = init_distributed( - return_tp_group=True, return_lc_group=True) - local_world_size = int(os.environ.get('LOCAL_WORLD_SIZE', 1)) + # for 2 node(16 GPUs), world_size=16,rank is 0-15,local rank 
is 0-7 + WORLD_SIZE, RANK, LOCAL_RANK, TP_GROUP, LC_GROUP = init_distributed(return_tp_group=True, return_lc_group=True) + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 0)) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=LOCAL_RANK, - num_local_ranks=local_world_size, - group=LC_GROUP) + size=2**25, device="cuda", is_distributed=True, local_rank=LOCAL_RANK, num_local_ranks=local_world_size, group=LC_GROUP + ) print(local_world_size, LOCAL_RANK) # Each rank sends the local_tensor to ranks of other nodes with the same local_rank @@ -99,7 +90,7 @@ def main( print(interkernel.get_kernel_source()) src = pynvshmem.nvshmem_create_tensor([M], torch.float32) dst = pynvshmem.nvshmem_create_tensor([M], torch.float32) - input_data = torch.ones([M], dtype=torch.float32, device='cuda') * RANK + input_data = torch.ones([M], dtype=torch.float32, device="cuda") * RANK src.copy_(input_data) pynvshmem.nvshmem_barrier_all() @@ -119,20 +110,14 @@ def main( src_intra = tilelang.tensor((M * 2), torch.float32, allocator=allocator).normal_() dst_intra = tilelang.tensor((M * WORLD_SIZE), torch.float32, allocator=allocator) if RANK < WORLD_SIZE / 2: - cudart.cudaMemcpy(src_intra.data_ptr(), src.data_ptr(), M * 4, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) - cudart.cudaMemcpy(src_intra.data_ptr() + M * 4, dst.data_ptr(), M * 4, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) + cudart.cudaMemcpy(src_intra.data_ptr(), src.data_ptr(), M * 4, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) + cudart.cudaMemcpy(src_intra.data_ptr() + M * 4, dst.data_ptr(), M * 4, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) else: - cudart.cudaMemcpy(src_intra.data_ptr(), dst.data_ptr(), M * 4, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) - cudart.cudaMemcpy(src_intra.data_ptr() + M * 4, src.data_ptr(), M * 4, - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) + cudart.cudaMemcpy(src_intra.data_ptr(), dst.data_ptr(), M * 4, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) + cudart.cudaMemcpy(src_intra.data_ptr() + M * 4, src.data_ptr(), M * 4, cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice) env.USE_NVSHMEM = False - intrakernel = tilelang.compile( - intranode_gather(M, WORLD_SIZE, M, 128), - pass_configs={tilelang.PassConfigKey.TL_DISABLE_RDC: True}) + intrakernel = tilelang.compile(intranode_gather(M, WORLD_SIZE, M, 128), pass_configs={tilelang.PassConfigKey.TL_DISABLE_RDC: True}) intrakernel.initialize(allocator=allocator) if LOCAL_RANK == 0: print(intrakernel.get_kernel_source()) diff --git a/examples/distributed/example_post_attn_all2all_transpose.py b/examples/distributed/example_post_attn_all2all_transpose.py index e17c55ad9..de2c43671 100644 --- a/examples/distributed/example_post_attn_all2all_transpose.py +++ b/examples/distributed/example_post_attn_all2all_transpose.py @@ -2,6 +2,7 @@ import torch.distributed as dist import pynvshmem import tilelang +import tilelang.testing import tilelang.language as T from tilelang.distributed import init_distributed, dtype_map import argparse @@ -43,21 +44,14 @@ def torch_reverse_all_to_all_transpose_reference(data_src, group): # Step 2: Prepare output list for all_to_all output_list = [] for _ in range(world_size): - recv_data = torch.empty( - batch_size, - heads_per_pe, - seq_per_pe, - head_dim, - dtype=data_src.dtype, - device=data_src.device) + recv_data = torch.empty(batch_size, heads_per_pe, seq_per_pe, head_dim, dtype=data_src.dtype, device=data_src.device) 
output_list.append(recv_data) # Step 3: Execute all_to_all dist.all_to_all(output_list, input_list, group=group) # Step 4: Reorganize received data - result = torch.empty( - batch_size, seq_per_pe, num_heads, head_dim, dtype=data_src.dtype, device=data_src.device) + result = torch.empty(batch_size, seq_per_pe, num_heads, head_dim, dtype=data_src.dtype, device=data_src.device) for pe_idx in range(world_size): head_start = pe_idx * heads_per_pe @@ -69,12 +63,7 @@ def torch_reverse_all_to_all_transpose_reference(data_src, group): return result -def sequence_parallel_reverse_all_to_all_transpose(PE_num, - BATCH_SIZE, - NUM_HEADS, - SEQ_LEN, - HEAD_DIM, - dtype="float16"): +def sequence_parallel_reverse_all_to_all_transpose(PE_num, BATCH_SIZE, NUM_HEADS, SEQ_LEN, HEAD_DIM, dtype="float16"): """ Reverse All-to-All: Convert from head parallel to sequence parallel Input: [BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM] @@ -88,9 +77,9 @@ def sequence_parallel_reverse_all_to_all_transpose(PE_num, @T.prim_func def main( - data_src: T.Tensor((BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM), dtype), - data_dst: T.Tensor((BATCH_SIZE, SEQ_PER_PE, NUM_HEADS, HEAD_DIM), dtype), - signal: T.Tensor((PE_num,), "uint64"), + data_src: T.Tensor((BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM), dtype), + data_dst: T.Tensor((BATCH_SIZE, SEQ_PER_PE, NUM_HEADS, HEAD_DIM), dtype), + signal: T.Tensor((PE_num,), "uint64"), ): with T.Kernel(NUM_BLOCKS_X, PE_num, threads=128) as (bx, target_pe): tx = T.thread_binding(128, thread="threadIdx.x") @@ -118,8 +107,10 @@ def main( T.putmem_nbi_block( T.address_of(data_dst[batch_idx, seq_idx, dst_head_idx, 0]), - T.address_of(data_src[batch_idx, head_idx, src_seq_idx, 0]), transfer_size, - target_pe) + T.address_of(data_src[batch_idx, head_idx, src_seq_idx, 0]), + transfer_size, + target_pe, + ) T.fence() @@ -129,7 +120,8 @@ def main( T.address_of(signal[mype[0]]), 1, # Signal the number of head chunks processed T.Amo.SIGNAL_ADD, - target_pe) + target_pe, + ) T.fence() # Wait for all blocks to complete all head transfers T.signal_wait_until(T.address_of(signal[target_pe]), T.CmpType.EQ, NUM_BLOCKS_X) @@ -177,6 +169,7 @@ def parse_args(): return parser.parse_args() +@tilelang.testing.requires_distributed def test_reverse_transpose_all_to_all_with_golden_reference(): args = parse_args() @@ -203,13 +196,8 @@ def test_reverse_transpose_all_to_all_with_golden_reference(): print("Converting from HEAD_PARALLEL to SEQUENCE_PARALLEL") # Compile TileLang kernel - func = sequence_parallel_reverse_all_to_all_transpose(PE_num, args.batch_size, args.num_heads, - args.seq_len, args.head_dim, args.dtype) - kernel = tilelang.compile( - func, pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) + func = sequence_parallel_reverse_all_to_all_transpose(PE_num, args.batch_size, args.num_heads, args.seq_len, args.head_dim, args.dtype) + kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) if RANK == 0: print("\nTileLang Kernel Source:") @@ -219,9 +207,7 @@ def test_reverse_transpose_all_to_all_with_golden_reference(): dtype_torch = dtype_map[args.dtype] # Create input data: [BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM] - head parallel format - input_data = torch.rand([args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], - dtype=dtype_torch, - device='cuda') + input_data = torch.rand([args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype=dtype_torch, device="cuda") print(f"PE {RANK} Input 
shape: {input_data.shape}") print(f"PE {RANK} Input sample: {input_data[0, 0, 0, :3]}") @@ -235,10 +221,8 @@ def test_reverse_transpose_all_to_all_with_golden_reference(): # === Test 2: TileLang NVSHMEM Implementation === def tilelang_reverse_all_to_all(): # Create NVSHMEM tensors - data_src = pynvshmem.nvshmem_create_tensor( - [args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype_torch) - data_dst = pynvshmem.nvshmem_create_tensor( - [args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], dtype_torch) + data_src = pynvshmem.nvshmem_create_tensor([args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype_torch) + data_dst = pynvshmem.nvshmem_create_tensor([args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], dtype_torch) signal = pynvshmem.nvshmem_create_tensor([PE_num], torch.uint64) # Initialize data @@ -268,6 +252,7 @@ def tilelang_reverse_all_to_all(): dist.destroy_process_group() +@tilelang.testing.requires_distributed def test_roundtrip_consistency(): """Test that forward + reverse all-to-all gives back original data""" args = parse_args() @@ -285,9 +270,7 @@ def test_roundtrip_consistency(): SEQ_PER_PE = args.seq_len // WORLD_SIZE # Create original data in sequence parallel format - original_data = torch.rand([args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], - dtype=dtype_torch, - device='cuda') + original_data = torch.rand([args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], dtype=dtype_torch, device="cuda") # Forward: sequence parallel -> head parallel head_parallel_data = torch_sequence_all_to_all_transpose_reference(original_data, TP_GROUP) diff --git a/examples/distributed/example_pre_attn_all2all.py b/examples/distributed/example_pre_attn_all2all.py index 53884f5b0..cb85a9389 100644 --- a/examples/distributed/example_pre_attn_all2all.py +++ b/examples/distributed/example_pre_attn_all2all.py @@ -2,6 +2,7 @@ import torch.distributed as dist import pynvshmem import tilelang +import tilelang.testing import tilelang.language as T from tilelang.distributed import init_distributed, dtype_map import argparse @@ -44,13 +45,7 @@ def torch_sequence_all_to_all_reference(data_src, group): output_list = [] for _ in range(world_size): # Receive [BATCH_SIZE, HEADS_PER_PE, SEQ_PER_PE, HEAD_DIM] from each PE - recv_data = torch.empty( - batch_size, - heads_per_pe, - seq_per_pe, - head_dim, - dtype=data_src.dtype, - device=data_src.device) + recv_data = torch.empty(batch_size, heads_per_pe, seq_per_pe, head_dim, dtype=data_src.dtype, device=data_src.device) output_list.append(recv_data) # Step 3: Execute all_to_all @@ -59,8 +54,7 @@ def torch_sequence_all_to_all_reference(data_src, group): # Step 4: Reorganize received data # output_list[pe_idx] contains data from PE pe_idx # Need to arrange by sequence dimension - result = torch.empty( - batch_size, heads_per_pe, seq_len, head_dim, dtype=data_src.dtype, device=data_src.device) + result = torch.empty(batch_size, heads_per_pe, seq_len, head_dim, dtype=data_src.dtype, device=data_src.device) for pe_idx in range(world_size): seq_start = pe_idx * seq_per_pe @@ -86,12 +80,12 @@ def sequence_parallel_all_to_all(PE_num, BATCH_SIZE, NUM_HEADS, SEQ_LEN, HEAD_DI @T.prim_func def main( - # Input: [BATCH_SIZE, NUM_HEADS, SEQ_PER_PE, HEAD_DIM] - data_src: T.Tensor((BATCH_SIZE, NUM_HEADS, SEQ_PER_PE, HEAD_DIM), dtype), - # Output: [BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM] - data_dst: T.Tensor((BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM), dtype), - # Sync signals - signal: T.Tensor((PE_num,), 
"uint64"), + # Input: [BATCH_SIZE, NUM_HEADS, SEQ_PER_PE, HEAD_DIM] + data_src: T.Tensor((BATCH_SIZE, NUM_HEADS, SEQ_PER_PE, HEAD_DIM), dtype), + # Output: [BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM] + data_dst: T.Tensor((BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM), dtype), + # Sync signals + signal: T.Tensor((PE_num,), "uint64"), ): # Grid: (batch*head, target_pe) with T.Kernel(NUM_BLOCKS_X, PE_num, threads=128) as (bx, target_pe): @@ -116,7 +110,10 @@ def main( # Single block transfer for entire [SEQ_PER_PE, HEAD_DIM] data T.putmem_nbi_block( T.address_of(data_dst[batch_idx, head_idx, dst_seq_start, 0]), - T.address_of(data_src[batch_idx, src_head_idx, 0, 0]), transfer_size, target_pe) + T.address_of(data_src[batch_idx, src_head_idx, 0, 0]), + transfer_size, + target_pe, + ) # Memory fence T.fence() @@ -127,7 +124,8 @@ def main( T.address_of(signal[mype[0]]), 1, 10, # NVSHMEM_SIGNAL_ADD - target_pe) + target_pe, + ) T.fence() for k in T.serial(PE_num): T.signal_wait_until(T.address_of(signal[k]), 0, NUM_BLOCKS_X) @@ -165,8 +163,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--batch_size", type=int, default=2, help="Batch size") parser.add_argument("--seq_len", type=int, default=256, help="Sequence length") - parser.add_argument( - "--num_heads", type=int, default=16, help="Number of attention heads,combine QKV") + parser.add_argument("--num_heads", type=int, default=16, help="Number of attention heads,combine QKV") parser.add_argument("--head_dim", type=int, default=64, help="Head dimension") parser.add_argument("--dtype", default="float16", help="Data type") parser.add_argument("--warmup", type=int, default=3, help="Warmup iterations") @@ -175,6 +172,7 @@ def parse_args(): return parser.parse_args() +@tilelang.testing.requires_distributed def test_all_to_all_with_golden_reference(): args = parse_args() @@ -200,13 +198,8 @@ def test_all_to_all_with_golden_reference(): print(f"Heads per PE: {HEADS_PER_PE}") # Compile TileLang kernel - func = sequence_parallel_all_to_all(PE_num, args.batch_size, args.num_heads, args.seq_len, - args.head_dim, args.dtype) - kernel = tilelang.compile( - func, pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) + func = sequence_parallel_all_to_all(PE_num, args.batch_size, args.num_heads, args.seq_len, args.head_dim, args.dtype) + kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) if RANK == 0: print("\nTileLang Kernel Source:") @@ -216,9 +209,7 @@ def test_all_to_all_with_golden_reference(): dtype_torch = dtype_map[args.dtype] # Create input data (same for both implementations) - input_data = torch.rand([args.batch_size, args.num_heads, SEQ_PER_PE, args.head_dim], - dtype=dtype_torch, - device='cuda') + input_data = torch.rand([args.batch_size, args.num_heads, SEQ_PER_PE, args.head_dim], dtype=dtype_torch, device="cuda") print(f"PE {RANK} Input shape: {input_data.shape}") print(f"PE {RANK} Input sample: {input_data[0, 0, 0, :3]}") @@ -233,10 +224,8 @@ def test_all_to_all_with_golden_reference(): # === Test 2: TileLang NVSHMEM Implementation === def tilelang_all_to_all(): # Create NVSHMEM tensors - data_src = pynvshmem.nvshmem_create_tensor( - [args.batch_size, args.num_heads, SEQ_PER_PE, args.head_dim], dtype_torch) - data_dst = pynvshmem.nvshmem_create_tensor( - [args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype_torch) + data_src = pynvshmem.nvshmem_create_tensor([args.batch_size, args.num_heads, SEQ_PER_PE, 
args.head_dim], dtype_torch) + data_dst = pynvshmem.nvshmem_create_tensor([args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype_torch) signal = pynvshmem.nvshmem_create_tensor([PE_num], torch.uint64) # Initialize data @@ -246,7 +235,7 @@ def tilelang_all_to_all(): # Execute kernel kernel(data_src, data_dst, signal) - #pynvshmem.nvshmem_barrier_all() + # pynvshmem.nvshmem_barrier_all() return data_dst diff --git a/examples/distributed/example_pre_attn_all2all_transpose.py b/examples/distributed/example_pre_attn_all2all_transpose.py index f5c4b9fc3..80f6ef6b7 100644 --- a/examples/distributed/example_pre_attn_all2all_transpose.py +++ b/examples/distributed/example_pre_attn_all2all_transpose.py @@ -2,6 +2,7 @@ import torch.distributed as dist import pynvshmem import tilelang +import tilelang.testing import tilelang.language as T from tilelang.distributed import init_distributed, dtype_map import argparse @@ -41,21 +42,14 @@ def torch_sequence_all_to_all_transpose_reference(data_src, group): # Step 2: Prepare output list for all_to_all output_list = [] for _ in range(world_size): - recv_data = torch.empty( - batch_size, - seq_per_pe, - heads_per_pe, - head_dim, - dtype=data_src.dtype, - device=data_src.device) + recv_data = torch.empty(batch_size, seq_per_pe, heads_per_pe, head_dim, dtype=data_src.dtype, device=data_src.device) output_list.append(recv_data) # Step 3: Execute all_to_all dist.all_to_all(output_list, input_list, group=group) # Step 4: Reorganize received data with transpose - result = torch.empty( - batch_size, heads_per_pe, seq_len, head_dim, dtype=data_src.dtype, device=data_src.device) + result = torch.empty(batch_size, heads_per_pe, seq_len, head_dim, dtype=data_src.dtype, device=data_src.device) for pe_idx in range(world_size): seq_start = pe_idx * seq_per_pe @@ -67,12 +61,7 @@ def torch_sequence_all_to_all_transpose_reference(data_src, group): return result -def sequence_parallel_all_to_all_transpose(PE_num, - BATCH_SIZE, - NUM_HEADS, - SEQ_LEN, - HEAD_DIM, - dtype="float16"): +def sequence_parallel_all_to_all_transpose(PE_num, BATCH_SIZE, NUM_HEADS, SEQ_LEN, HEAD_DIM, dtype="float16"): """ Coarse-grained version with proper transpose handling Each block handles one (batch, head) combination and processes all sequence positions @@ -85,9 +74,9 @@ def sequence_parallel_all_to_all_transpose(PE_num, @T.prim_func def main( - data_src: T.Tensor((BATCH_SIZE, SEQ_PER_PE, NUM_HEADS, HEAD_DIM), dtype), - data_dst: T.Tensor((BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM), dtype), - signal: T.Tensor((PE_num,), "uint64"), + data_src: T.Tensor((BATCH_SIZE, SEQ_PER_PE, NUM_HEADS, HEAD_DIM), dtype), + data_dst: T.Tensor((BATCH_SIZE, HEADS_PER_PE, SEQ_LEN, HEAD_DIM), dtype), + signal: T.Tensor((PE_num,), "uint64"), ): with T.Kernel(NUM_BLOCKS_X, PE_num, threads=128) as (bx, target_pe): tx = T.thread_binding(128, thread="threadIdx.x") @@ -115,8 +104,10 @@ def main( T.putmem_nbi_block( T.address_of(data_dst[batch_idx, head_idx, dst_seq_idx, 0]), - T.address_of(data_src[batch_idx, seq_idx, src_head_idx, 0]), transfer_size, - target_pe) + T.address_of(data_src[batch_idx, seq_idx, src_head_idx, 0]), + transfer_size, + target_pe, + ) T.fence() @@ -126,7 +117,8 @@ def main( T.address_of(signal[mype[0]]), 1, # Signal the number of sequence positions processed T.Amo.SIGNAL_ADD, - target_pe) + target_pe, + ) T.fence() # Wait for all blocks to complete all sequence positions T.signal_wait_until(T.address_of(signal[target_pe]), T.CmpType.EQ, NUM_BLOCKS_X) @@ -173,6 +165,7 @@ def 
parse_args(): return parser.parse_args() +@tilelang.testing.requires_distributed def test_transpose_all_to_all_with_golden_reference(): args = parse_args() @@ -198,13 +191,8 @@ def test_transpose_all_to_all_with_golden_reference(): print(f"Heads per PE: {HEADS_PER_PE}") # Compile TileLang kernel - func = sequence_parallel_all_to_all_transpose(PE_num, args.batch_size, args.num_heads, - args.seq_len, args.head_dim, args.dtype) - kernel = tilelang.compile( - func, pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) + func = sequence_parallel_all_to_all_transpose(PE_num, args.batch_size, args.num_heads, args.seq_len, args.head_dim, args.dtype) + kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) if RANK == 0: print("\nTileLang Kernel Source:") @@ -214,9 +202,7 @@ def test_transpose_all_to_all_with_golden_reference(): dtype_torch = dtype_map[args.dtype] # Create input data: [BATCH_SIZE, SEQ_PER_PE, NUM_HEADS, HEAD_DIM] - random like example - input_data = torch.rand([args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], - dtype=dtype_torch, - device='cuda') + input_data = torch.rand([args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], dtype=dtype_torch, device="cuda") print(f"PE {RANK} Input shape: {input_data.shape}") print(f"PE {RANK} Input sample: {input_data[0, 0, 0, :3]}") @@ -230,10 +216,8 @@ def test_transpose_all_to_all_with_golden_reference(): # === Test 2: TileLang NVSHMEM Implementation === def tilelang_all_to_all(): # Create NVSHMEM tensors - data_src = pynvshmem.nvshmem_create_tensor( - [args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], dtype_torch) - data_dst = pynvshmem.nvshmem_create_tensor( - [args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype_torch) + data_src = pynvshmem.nvshmem_create_tensor([args.batch_size, SEQ_PER_PE, args.num_heads, args.head_dim], dtype_torch) + data_dst = pynvshmem.nvshmem_create_tensor([args.batch_size, HEADS_PER_PE, args.seq_len, args.head_dim], dtype_torch) signal = pynvshmem.nvshmem_create_tensor([PE_num], torch.uint64) # Initialize data diff --git a/examples/distributed/example_simple_shift.py b/examples/distributed/example_simple_shift.py index a837c4b8d..b1e69d960 100644 --- a/examples/distributed/example_simple_shift.py +++ b/examples/distributed/example_simple_shift.py @@ -5,11 +5,10 @@ def simple_shift(M, N, block_M, block_N, dtype="float16"): - @T.prim_func def main( - A: T.Buffer((M, N), dtype), - B: T.Buffer((M, N), dtype), + A: T.Buffer((M, N), dtype), + B: T.Buffer((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): mype = T.alloc_local([1], "int32") @@ -19,8 +18,7 @@ def main( npes[0] = T.get_pe_num() peer[0] = (mype[0] + 1) % npes[0] - T.putmem_nbi_block( - T.address_of(B[0, 0]), T.address_of(A[0, 0]), block_M * block_N * 2, peer[0]) + T.putmem_nbi_block(T.address_of(B[0, 0]), T.address_of(A[0, 0]), block_M * block_N * 2, peer[0]) return main @@ -28,6 +26,7 @@ def main( WORLD_SIZE, RANK, LOCAL_RANK = init_distributed() func = simple_shift(128, 128, 128, 128) +# Auto-selects cython backend when TILELANG_USE_DISTRIBUTED=1 is set kernel = tilelang.compile(func, out_idx=-1) # Get CUDA Source diff --git a/examples/distributed/example_sp_ag_attention_intra_node.py b/examples/distributed/example_sp_ag_attention_intra_node.py index c4d120fea..5b893e4f2 100644 --- a/examples/distributed/example_sp_ag_attention_intra_node.py +++ 
b/examples/distributed/example_sp_ag_attention_intra_node.py @@ -17,7 +17,6 @@ class FusedSequenceParallelAttn(torch.nn.Module): - def __init__( self, pg: torch.distributed.ProcessGroup, @@ -47,8 +46,9 @@ def __init__( self.max_seqlen_k = max_seqlen_k self.head_dim = head_dim - assert (max_seqlen_q % self.world_size == 0 and max_seqlen_q % self.world_size - == 0), f"sequence length should be multiple of world_size({self.world_size})" + assert max_seqlen_q % self.world_size == 0 and max_seqlen_q % self.world_size == 0, ( + f"sequence length should be multiple of world_size({self.world_size})" + ) self.max_q_shard_len = self.max_seqlen_q // self.world_size self.input_dtype = input_dtype @@ -101,7 +101,6 @@ def forward(self, q_shard, k_shards, v_shards, cu_seqlens_q, cu_seqlens_k, print class TorchSequenceParallelAttn(torch.nn.Module): - def __init__( self, pg: torch.distributed.ProcessGroup, @@ -138,8 +137,9 @@ def __init__( self.max_q_shard_len = max_seqlen_q // self.world_size self.max_kv_shard_ken = max_seqlen_q // self.world_size - assert (max_seqlen_q % self.world_size == 0 and max_seqlen_q % self.world_size - == 0), f"sequence length should be multiple of world_size({self.world_size})" + assert max_seqlen_q % self.world_size == 0 and max_seqlen_q % self.world_size == 0, ( + f"sequence length should be multiple of world_size({self.world_size})" + ) self.ag_k_buffer: torch.Tensor = torch.empty( self.batch_size * self.max_seqlen_k, @@ -161,9 +161,9 @@ def forward(self, q_shard, k_shard, v_shard, cu_seqlens_q, cu_seqlens_k): def _gen_mask(offset, q_shard_len, kv_len): if self.is_causal: mask = torch.zeros((q_shard_len, kv_len), dtype=torch.bool, device=self.device) - mask[:, :offset + q_shard_len] = True + mask[:, : offset + q_shard_len] = True if offset < kv_len: - mask[:, offset:offset + q_shard_len].tril_() + mask[:, offset : offset + q_shard_len].tril_() return mask return None @@ -186,37 +186,27 @@ def _gen_mask(offset, q_shard_len, kv_len): half_q_shard_len = q_shard_len // 2 half_kv_shard_len = kv_shard_len // 2 - q0_shard = q_shard[cu_seqlens_q_start:cu_seqlens_q_start + - half_q_shard_len, :, :].clone() - q1_shard = q_shard[cu_seqlens_q_start + - half_q_shard_len:cu_seqlens_q_end, :, :].clone() - - q0_shard_permute = torch.permute( - q0_shard.reshape(1, half_q_shard_len, q_head, head_dim), - (0, 2, 1, 3)).contiguous() - q1_shard_permute = torch.permute( - q1_shard.reshape(1, half_q_shard_len, q_head, head_dim), - (0, 2, 1, 3)).contiguous() - - k0_shard = k_shard[cu_seqlens_k_start:cu_seqlens_k_start + - half_kv_shard_len, :, :].clone() - k1_shard = k_shard[cu_seqlens_k_start + - half_kv_shard_len:cu_seqlens_k_end, :, :].clone() - v0_shard = v_shard[cu_seqlens_k_start:cu_seqlens_k_start + - half_kv_shard_len, :, :].clone() - v1_shard = v_shard[cu_seqlens_k_start + - half_kv_shard_len:cu_seqlens_k_end, :, :].clone() - - buffer_size = (half_kv_shard_len * kv_head * head_dim * self.world_size) - - ag_k0 = self.ag_k_buffer.reshape(-1)[:buffer_size].reshape( - half_kv_shard_len * self.world_size, kv_head, head_dim) - ag_k1 = self.ag_k_buffer.reshape(-1)[buffer_size:2 * buffer_size].reshape( - half_kv_shard_len * self.world_size, kv_head, head_dim) - ag_v0 = self.ag_v_buffer.reshape(-1)[:buffer_size].reshape( - half_kv_shard_len * self.world_size, kv_head, head_dim) - ag_v1 = self.ag_v_buffer.reshape(-1)[buffer_size:2 * buffer_size].reshape( - half_kv_shard_len * self.world_size, kv_head, head_dim) + q0_shard = q_shard[cu_seqlens_q_start : cu_seqlens_q_start + half_q_shard_len, :, 
:].clone() + q1_shard = q_shard[cu_seqlens_q_start + half_q_shard_len : cu_seqlens_q_end, :, :].clone() + + q0_shard_permute = torch.permute(q0_shard.reshape(1, half_q_shard_len, q_head, head_dim), (0, 2, 1, 3)).contiguous() + q1_shard_permute = torch.permute(q1_shard.reshape(1, half_q_shard_len, q_head, head_dim), (0, 2, 1, 3)).contiguous() + + k0_shard = k_shard[cu_seqlens_k_start : cu_seqlens_k_start + half_kv_shard_len, :, :].clone() + k1_shard = k_shard[cu_seqlens_k_start + half_kv_shard_len : cu_seqlens_k_end, :, :].clone() + v0_shard = v_shard[cu_seqlens_k_start : cu_seqlens_k_start + half_kv_shard_len, :, :].clone() + v1_shard = v_shard[cu_seqlens_k_start + half_kv_shard_len : cu_seqlens_k_end, :, :].clone() + + buffer_size = half_kv_shard_len * kv_head * head_dim * self.world_size + + ag_k0 = self.ag_k_buffer.reshape(-1)[:buffer_size].reshape(half_kv_shard_len * self.world_size, kv_head, head_dim) + ag_k1 = self.ag_k_buffer.reshape(-1)[buffer_size : 2 * buffer_size].reshape( + half_kv_shard_len * self.world_size, kv_head, head_dim + ) + ag_v0 = self.ag_v_buffer.reshape(-1)[:buffer_size].reshape(half_kv_shard_len * self.world_size, kv_head, head_dim) + ag_v1 = self.ag_v_buffer.reshape(-1)[buffer_size : 2 * buffer_size].reshape( + half_kv_shard_len * self.world_size, kv_head, head_dim + ) torch.distributed.all_gather_into_tensor( ag_k0, k0_shard, @@ -238,19 +228,15 @@ def _gen_mask(offset, q_shard_len, kv_len): group=self.pg, ) ag_k1 = ag_k1.reshape(self.world_size, half_kv_shard_len, kv_head, head_dim) - ag_k1 = torch.flip(ag_k1, [0]).reshape(self.world_size * half_kv_shard_len, kv_head, - head_dim) + ag_k1 = torch.flip(ag_k1, [0]).reshape(self.world_size * half_kv_shard_len, kv_head, head_dim) ag_k = torch.cat((ag_k0, ag_k1), dim=0) - ag_k = torch.permute(ag_k.reshape(1, kv_len, kv_head, head_dim), - (0, 2, 1, 3)).contiguous() + ag_k = torch.permute(ag_k.reshape(1, kv_len, kv_head, head_dim), (0, 2, 1, 3)).contiguous() ag_k = ag_k.repeat_interleave(q_head // kv_head, -3) ag_v1 = ag_v1.reshape(self.world_size, half_kv_shard_len, kv_head, head_dim) - ag_v1 = torch.flip(ag_v1, [0]).reshape(self.world_size * half_kv_shard_len, kv_head, - head_dim) + ag_v1 = torch.flip(ag_v1, [0]).reshape(self.world_size * half_kv_shard_len, kv_head, head_dim) ag_v = torch.cat((ag_v0, ag_v1), dim=0) - ag_v = torch.permute(ag_v.reshape(1, kv_len, kv_head, head_dim), - (0, 2, 1, 3)).contiguous() + ag_v = torch.permute(ag_v.reshape(1, kv_len, kv_head, head_dim), (0, 2, 1, 3)).contiguous() ag_v = ag_v.repeat_interleave(q_head // kv_head, -3) offset_q0 = half_q_shard_len * self.rank @@ -258,16 +244,12 @@ def _gen_mask(offset, q_shard_len, kv_len): prefix = kv_len - q_len mask0 = _gen_mask(prefix + offset_q0, half_q_shard_len, kv_len) mask1 = _gen_mask(prefix + offset_q1, half_q_shard_len, kv_len) - out0 = torch.nn.functional.scaled_dot_product_attention( - q0_shard_permute, ag_k, ag_v, attn_mask=mask0) - out1 = torch.nn.functional.scaled_dot_product_attention( - q1_shard_permute, ag_k, ag_v, attn_mask=mask1) + out0 = torch.nn.functional.scaled_dot_product_attention(q0_shard_permute, ag_k, ag_v, attn_mask=mask0) + out1 = torch.nn.functional.scaled_dot_product_attention(q1_shard_permute, ag_k, ag_v, attn_mask=mask1) out = torch.cat((out0, out1), dim=2) # [1, q_head, q_shard_len, head_dim] else: cu_q_shard = q_shard[cu_seqlens_q_start:cu_seqlens_q_end, :, :].clone() - cu_q_shard_permute = torch.permute( - cu_q_shard.reshape(1, q_shard_len, q_head, head_dim), - (0, 2, 1, 3)).contiguous() + cu_q_shard_permute = 
torch.permute(cu_q_shard.reshape(1, q_shard_len, q_head, head_dim), (0, 2, 1, 3)).contiguous() total_size = kv_len * kv_head * head_dim ag_k = self.ag_k_buffer.reshape(-1)[:total_size].reshape(kv_len, kv_head, head_dim) @@ -284,19 +266,17 @@ def _gen_mask(offset, q_shard_len, kv_len): cu_v_shard, group=self.pg, ) - ag_k = torch.permute(ag_k.reshape(1, kv_len, kv_head, head_dim), - (0, 2, 1, 3)).contiguous() + ag_k = torch.permute(ag_k.reshape(1, kv_len, kv_head, head_dim), (0, 2, 1, 3)).contiguous() ag_k = ag_k.repeat_interleave(q_head // kv_head, -3) - ag_v = torch.permute(ag_v.reshape(1, kv_len, kv_head, head_dim), - (0, 2, 1, 3)).contiguous() + ag_v = torch.permute(ag_v.reshape(1, kv_len, kv_head, head_dim), (0, 2, 1, 3)).contiguous() ag_v = ag_v.repeat_interleave(q_head // kv_head, -3) offset = self.rank * q_shard_len prefix = kv_len - q_len mask = _gen_mask(prefix + offset, q_shard_len, kv_len) out = torch.nn.functional.scaled_dot_product_attention( - cu_q_shard_permute, ag_k, ag_v, - attn_mask=mask) # [1, q_head, q_shard_len, head_dim] + cu_q_shard_permute, ag_k, ag_v, attn_mask=mask + ) # [1, q_head, q_shard_len, head_dim] out = torch.permute(out.reshape(q_head, q_shard_len, head_dim), (1, 0, 2)).contiguous() out_list.append(out) @@ -327,29 +307,20 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) assert rank == local_rank and num_ranks == num_local_ranks, "only support single node for now" allocator = tilelang.get_allocator( - size=2**30, - device=device, - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**30, device=device, is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) cu_seqlens_q = torch.tensor(cu_seqlens_q_list, dtype=torch.int32, device=device) cu_seqlens_q = cu_seqlens_q // num_local_ranks cu_seqlens_k = torch.tensor(cu_seqlens_k_list, dtype=torch.int32, device=device) - q_shard = tilelang.tensor((cu_seqlens_q[-1], q_head, head_dim), - dtype=dtype, - allocator=allocator).normal_( - mean=0.0, std=0.5) - k_shards = tilelang.tensor((cu_seqlens_k[-1] // num_local_ranks, kv_head, head_dim), - dtype=dtype, - allocator=allocator, - return_peers=True) - v_shards = tilelang.tensor((cu_seqlens_k[-1] // num_local_ranks, kv_head, head_dim), - dtype=dtype, - allocator=allocator, - return_peers=True) + q_shard = tilelang.tensor((cu_seqlens_q[-1], q_head, head_dim), dtype=dtype, allocator=allocator).normal_(mean=0.0, std=0.5) + k_shards = tilelang.tensor( + (cu_seqlens_k[-1] // num_local_ranks, kv_head, head_dim), dtype=dtype, allocator=allocator, return_peers=True + ) + v_shards = tilelang.tensor( + (cu_seqlens_k[-1] // num_local_ranks, kv_head, head_dim), dtype=dtype, allocator=allocator, return_peers=True + ) k_shards[local_rank].normal_(mean=0.0, std=0.5) v_shards[local_rank].normal_(mean=0.0, std=0.5) @@ -386,12 +357,10 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): enable_zig_zag, ) - tilescale_out = tilescale_module( - q_shard, k_shards, v_shards, cu_seqlens_q, cu_seqlens_k, print_source=True) + tilescale_out = tilescale_module(q_shard, k_shards, v_shards, cu_seqlens_q, cu_seqlens_k, print_source=True) print(f"tilescale_out: {tilescale_out.shape}") - torch_out = torch_module(q_shard, k_shards[local_rank], v_shards[local_rank], cu_seqlens_q, - cu_seqlens_k) + torch_out = torch_module(q_shard, k_shards[local_rank], v_shards[local_rank], cu_seqlens_q, cu_seqlens_k) 
print(f"torch_out: {torch_out.shape}") atol = 1e-2 @@ -402,10 +371,7 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): print(f"rank {local_rank} check failed.❌") print(f"torch_out: {torch_out}, tilelang_out: {tilescale_out}") - _, tl_t = perf_fn( - lambda: tilescale_module(q_shard, k_shards, v_shards, cu_seqlens_q, cu_seqlens_k), - warmup=5, - rep=5) + _, tl_t = perf_fn(lambda: tilescale_module(q_shard, k_shards, v_shards, cu_seqlens_q, cu_seqlens_k), warmup=5, rep=5) print(f"rank {local_rank} tilescale time: {tl_t:.2f} ms") @@ -414,20 +380,16 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=1, help='Number of processes to spawn (default: 2)') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") parser.add_argument("--batch_size", type=int, default=2, help="batch size") parser.add_argument("--q_head", type=int, default=32, help="num q heads") parser.add_argument("--kv_head", type=int, default=8, help="num kv heads") parser.add_argument("--max_seqlen_q", type=int, default=8192, help="max sequence length of q") - parser.add_argument( - "--max_seqlen_k", type=int, default=12288, help="max sequence length of k/v") + parser.add_argument("--max_seqlen_k", type=int, default=12288, help="max sequence length of k/v") parser.add_argument("--head_dim", type=int, default=128, help="head dim") - parser.add_argument( - "--seqlens_q", type=int, nargs='+', default=[4096, 8192], help="sequence lengths of q") - parser.add_argument( - "--seqlens_k", type=int, nargs='+', default=[6144, 12288], help="sequence lengths of k/v") - parser.add_argument('--is_causal', action='store_true', help='causal') + parser.add_argument("--seqlens_q", type=int, nargs="+", default=[4096, 8192], help="sequence lengths of q") + parser.add_argument("--seqlens_k", type=int, nargs="+", default=[6144, 12288], help="sequence lengths of k/v") + parser.add_argument("--is_causal", action="store_true", help="causal") parser.add_argument( "--zig-zag", "--no-zig-zag", diff --git a/examples/distributed/example_summa.py b/examples/distributed/example_summa.py index 168517c09..640a31de6 100644 --- a/examples/distributed/example_summa.py +++ b/examples/distributed/example_summa.py @@ -11,7 +11,6 @@ def summa(MESH, M, N, K, block_M, block_N, block_K, dtype="float16"): - M_local = T.ceildiv(M, MESH) N_local = T.ceildiv(N, MESH) K_local = T.ceildiv(K, MESH) @@ -22,13 +21,13 @@ def summa(MESH, M, N, K, block_M, block_N, block_K, dtype="float16"): @T.prim_func def main( - A: T.Tensor((2, M_local, K_local), dtype), - B: T.Tensor((2, N_local, K_local), dtype), - A_signal_to: T.Tensor((T.ceildiv(M, block_M),), "uint64"), - A_signal_from: T.Tensor((T.ceildiv(M, block_M),), "uint64"), - B_signal_to: T.Tensor((T.ceildiv(N, block_N),), "uint64"), - B_signal_from: T.Tensor((T.ceildiv(N, block_N),), "uint64"), - C: T.Tensor((M_local, N_local), dtype), + A: T.Tensor((2, M_local, K_local), dtype), + B: T.Tensor((2, N_local, K_local), dtype), + A_signal_to: T.Tensor((T.ceildiv(M, block_M),), "uint64"), + A_signal_from: T.Tensor((T.ceildiv(M, block_M),), "uint64"), + B_signal_to: T.Tensor((T.ceildiv(N, block_N),), "uint64"), + B_signal_from: T.Tensor((T.ceildiv(N, block_N),), "uint64"), + C: T.Tensor((M_local, N_local), dtype), ): grid_size = T.min(sm_num, total_tiles) A_rows_per_block = T.ceildiv(M_local, grid_size) @@ -63,8 
+62,11 @@ def main( T.address_of(A[(ko + 1) % 2, A_rows_per_block * block_id, 0]), T.address_of(A[ko % 2, A_rows_per_block * block_id, 0]), A_rows_per_block * K_local * dtype_map[dtype].itemsize, - T.address_of(A_signal_to[0]), 1, T.Amo.SIGNAL_ADD, - pe_mn * MESH + peer_k) + T.address_of(A_signal_to[0]), + 1, + T.Amo.SIGNAL_ADD, + pe_mn * MESH + peer_k, + ) # broadcast B if pe_k == ko: @@ -80,8 +82,11 @@ def main( T.address_of(B[(ko + 1) % 2, B_cols_per_block * block_id, 0]), T.address_of(B[ko % 2, B_cols_per_block * block_id, 0]), B_cols_per_block * K_local * dtype_map[dtype].itemsize, - T.address_of(B_signal_to[0]), 1, T.Amo.SIGNAL_ADD, - pe_mn * MESH + peer_k) + T.address_of(B_signal_to[0]), + 1, + T.Amo.SIGNAL_ADD, + pe_mn * MESH + peer_k, + ) # TODO: check if __syncthreads() is needed T.signal_wait_until( @@ -96,7 +101,6 @@ def main( ) for w in T.serial(waves): - bx = (grid_size * w + block_id) // T.ceildiv(N_local, block_N) by = (grid_size * w + block_id) % T.ceildiv(N_local, block_N) @@ -158,11 +162,7 @@ def parse_args(): K_local = math.ceil(K / MESH) func = summa(MESH, M, N, K, block_M, block_N, block_K, args.dtype) - kernel = tilelang.compile( - func, pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) + kernel = tilelang.compile(func, pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) # Get CUDA Source if RANK == 0: @@ -183,9 +183,9 @@ def parse_args(): b_scatter_list = [] for r in range(WORLD_SIZE): rr, cc = divmod(r, MESH) - c_tile = C[M_local * rr:M_local * (rr + 1), N_local * cc:N_local * (cc + 1)] - a_tile = A[M_local * rr:M_local * (rr + 1), K_local * cc:K_local * (cc + 1)] - b_tile = B[N_local * cc:N_local * (cc + 1), K_local * rr:K_local * (rr + 1)] + c_tile = C[M_local * rr : M_local * (rr + 1), N_local * cc : N_local * (cc + 1)] + a_tile = A[M_local * rr : M_local * (rr + 1), K_local * cc : K_local * (cc + 1)] + b_tile = B[N_local * cc : N_local * (cc + 1), K_local * rr : K_local * (rr + 1)] c_scatter_list.append(c_tile.contiguous()) a_scatter_list.append(a_tile.contiguous()) @@ -220,7 +220,7 @@ def parse_args(): dist.barrier() if r == RANK: if torch.allclose(C_tilelang, ref, rtol=1e-2, atol=1e-2): - print('-' * 100) + print("-" * 100) print(f"[Rank {RANK}] ✅ Tilelang and Torch match") else: abs_error = torch.abs(C_tilelang - ref) @@ -230,7 +230,7 @@ def parse_args(): max_rel_error = rel_error.max().item() mismatch_ratio = (abs_error > (1e-2 + 1e-2 * torch.abs(ref))).float().mean().item() - print('-' * 100) + print("-" * 100) print(f"[Rank {RANK}] ❌ Tilelang and Torch mismatch") print(f"[Rank {RANK}] ref:\n{ref}") print(f"[Rank {RANK}] tilelang:\n{C_tilelang}") @@ -281,8 +281,7 @@ def reduce_local_time(local_time): total_flops = 2 * M * N * K -avg_time = reduce_local_time( - bench(kernel, A, B, A_signal_to, A_signal_from, B_signal_to, B_signal_from, C_tilelang)) +avg_time = reduce_local_time(bench(kernel, A, B, A_signal_to, A_signal_from, B_signal_to, B_signal_from, C_tilelang)) if RANK == 0: print(f"avg time of RANK {RANK}: {avg_time} ms") diff --git a/examples/distributed/gemm_rs_utils.py b/examples/distributed/gemm_rs_utils.py index 2d8141467..0a6634c39 100644 --- a/examples/distributed/gemm_rs_utils.py +++ b/examples/distributed/gemm_rs_utils.py @@ -79,16 +79,13 @@ def __post_init__(self): for buf in self.signal_bufs: assert buf.shape[0] >= 2 * self.world_size - self.scatter_signal_bufs = [buf[:self.world_size] for buf in self.signal_bufs] - self.rs_per_node_signal_bufs = [ - 
buf[self.world_size:self.world_size * 2] for buf in self.signal_bufs - ] + self.scatter_signal_bufs = [buf[: self.world_size] for buf in self.signal_bufs] + self.rs_per_node_signal_bufs = [buf[self.world_size : self.world_size * 2] for buf in self.signal_bufs] for node_id in range(self.nnodes): self.scatter_signal_buf_list_for_each_node.append( - self.scatter_signal_bufs[self.local_rank][node_id * - self.local_world_size:(node_id + 1) * - self.local_world_size]) + self.scatter_signal_bufs[self.local_rank][node_id * self.local_world_size : (node_id + 1) * self.local_world_size] + ) def reset_barriers(self) -> int: # self.scatter_signal_bufs[self.local_rank].fill_(0) @@ -101,9 +98,7 @@ def get_scatter_bufs_and_signal_for_each_node(self, input, node_id): M_per_node = M_per_rank * self.local_world_size M_start = node_id * M_per_node M_end = M_start + M_per_node - scatter_bufs_intra_node = [ - self.scatter_bufs[i][M_start:M_end] for i in range(self.local_world_size) - ] + scatter_bufs_intra_node = [self.scatter_bufs[i][M_start:M_end] for i in range(self.local_world_size)] return scatter_bufs_intra_node, self.scatter_signal_buf_list_for_each_node[node_id] @property @@ -131,36 +126,32 @@ def scatter_signal_buf(self) -> torch.Tensor: return self.scatter_signal_bufs[self.local_rank] -def create_reduce_scater_2d_ctx(max_M, - N, - rank, - world_size, - local_world_size, - dtype, - overlap_with_gemm=True, - num_reduction_sms=15) -> ReduceScatter2DContext: +def create_reduce_scater_2d_ctx( + max_M, N, rank, world_size, local_world_size, dtype, overlap_with_gemm=True, num_reduction_sms=15 +) -> ReduceScatter2DContext: """ - for num_reduction_sms: tunable param, 16 are enough for H800 - For H800, we overlap local reduce and inter-node p2p with intra-node scatter. - The reduction kernel bandwidth is not a bottleneck if it exceeds 450GB, so only a few SMs are needed. - For machines with higher intra_node bandwidth(e.g. H100), we may need to increase the number of SMs or redesign overlapping. + for num_reduction_sms: tunable param, 16 are enough for H800 + For H800, we overlap local reduce and inter-node p2p with intra-node scatter. + The reduction kernel bandwidth is not a bottleneck if it exceeds 450GB, so only a few SMs are needed. + For machines with higher intra_node bandwidth(e.g. H100), we may need to increase the number of SMs or redesign overlapping. 
""" assert world_size % local_world_size == 0 assert max_M % world_size == 0 scatter_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node([max_M, N], dtype) - rs_per_node_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node( - [max_M // local_world_size, N], dtype) + rs_per_node_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node([max_M // local_world_size, N], dtype) - p2p_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node([max_M // local_world_size, N], - dtype) + p2p_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node([max_M // local_world_size, N], dtype) # signal_buf: scatter_signal | rs_per_node_signal num_signal_bufs = 2 - signal_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node([ - world_size * num_signal_bufs, - ], SIGNAL_DTYPE) + signal_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node( + [ + world_size * num_signal_bufs, + ], + SIGNAL_DTYPE, + ) # TODO: implement barrier_all_on_stream # barrier_all_on_stream(None, torch.cuda.current_stream()) @@ -187,7 +178,8 @@ def create_reduce_scater_2d_ctx(max_M, p2p_stream=p2p_stream, num_sync_sms=num_sync_sms, num_p2p_sms=num_p2p_sms, - num_reduction_sms=num_reduction_sms) + num_reduction_sms=num_reduction_sms, + ) return ctx @@ -211,14 +203,7 @@ class GEMMReduceScatterTensorParallelContext: GROUP_M: int = 8 stages: int = 3 - def update(self, - rs_stream, - output_dtype=None, - BLOCK_M=128, - BLOCK_N=256, - BLOCK_K=64, - GROUP_M=8, - stages=3): + def update(self, rs_stream, output_dtype=None, BLOCK_M=128, BLOCK_N=256, BLOCK_K=64, GROUP_M=8, stages=3): self.rs_stream = rs_stream self.output_dtype = output_dtype self.BLOCK_M = BLOCK_M @@ -233,20 +218,10 @@ def get_gemm_out_buf(self, input): return self.gemm_out_bufs[local_rank][:M] -def create_gemm_rs_context(max_M, - N, - rank, - world_size, - local_world_size, - output_dtype, - rs_stream, - BLOCK_M=128, - BLOCK_N=256, - BLOCK_K=64, - GROUP_M=8, - stages=3) -> GEMMReduceScatterTensorParallelContext: - rs_ctx = create_reduce_scater_2d_ctx( - max_M, N, rank, world_size, local_world_size, output_dtype, overlap_with_gemm=True) +def create_gemm_rs_context( + max_M, N, rank, world_size, local_world_size, output_dtype, rs_stream, BLOCK_M=128, BLOCK_N=256, BLOCK_K=64, GROUP_M=8, stages=3 +) -> GEMMReduceScatterTensorParallelContext: + rs_ctx = create_reduce_scater_2d_ctx(max_M, N, rank, world_size, local_world_size, output_dtype, overlap_with_gemm=True) NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count num_gemm_sms = NUM_SMS - rs_ctx.num_rs_sms gemm_out_bufs = pynvshmem.nvshmem_create_tensor_list_intra_node([max_M, N], output_dtype) @@ -260,5 +235,6 @@ def create_gemm_rs_context(max_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_M=GROUP_M, - stages=stages) + stages=stages, + ) return ctx diff --git a/examples/distributed/primitives/example_get_block.py b/examples/distributed/primitives/example_get_block.py index 9039fbf6c..369e81032 100644 --- a/examples/distributed/primitives/example_get_block.py +++ b/examples/distributed/primitives/example_get_block.py @@ -8,15 +8,14 @@ from tilelang.distributed import init_dist tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def get_kernel(M, num_rank, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M), "float32"), - src: T.Tensor((M), "float32"), + dst: T.Tensor((M), "float32"), + src: T.Tensor((M), "float32"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") @@ -42,12 
+41,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**25, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) kernel = tilelang.compile(get_kernel(M, num_ranks, BLOCK_M, threads)) kernel.initialize(allocator=allocator) if local_rank == 0: @@ -78,9 +73,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=65536, help='M dimension') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=65536, help="M dimension") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/primitives/example_get_warp.py b/examples/distributed/primitives/example_get_warp.py index 49b1fc02a..80d34d2ce 100644 --- a/examples/distributed/primitives/example_get_warp.py +++ b/examples/distributed/primitives/example_get_warp.py @@ -8,15 +8,14 @@ from tilelang.distributed import init_dist tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def get_kernel(M, num_rank, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M), "float32"), - src: T.Tensor((M), "float32"), + dst: T.Tensor((M), "float32"), + src: T.Tensor((M), "float32"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") @@ -31,7 +30,8 @@ def main( dst=T.address_of(dst[warp_start]), size=warp_copy_size, src_pe=rank[0] ^ 1, - unroll_factor=4) + unroll_factor=4, + ) T.fence_sys() return main @@ -45,12 +45,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**25, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) kernel = tilelang.compile(get_kernel(M, num_ranks, BLOCK_M, threads)) kernel.initialize(allocator=allocator) if local_rank == 0: @@ -81,9 +77,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=65536, help='M dimension') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=65536, help="M dimension") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/primitives/example_put_block.py b/examples/distributed/primitives/example_put_block.py index 19e22b1ce..3b59c6c56 100644 --- a/examples/distributed/primitives/example_put_block.py +++ b/examples/distributed/primitives/example_put_block.py @@ -8,15 +8,14 @@ 
from tilelang.distributed import init_dist tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def kernel_(M, num_rank, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M), "float32"), - src: T.Tensor((M), "float32"), + dst: T.Tensor((M), "float32"), + src: T.Tensor((M), "float32"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") @@ -41,12 +40,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**25, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) kernel = tilelang.compile(kernel_(M, num_ranks, BLOCK_M, threads)) kernel.initialize(allocator=allocator) if local_rank == 0: @@ -77,9 +72,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=65536, help='M dimension') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=65536, help="M dimension") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/primitives/example_put_warp.py b/examples/distributed/primitives/example_put_warp.py index a0351f6bf..4d397bc9d 100644 --- a/examples/distributed/primitives/example_put_warp.py +++ b/examples/distributed/primitives/example_put_warp.py @@ -8,15 +8,14 @@ from tilelang.distributed import init_dist tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def kernel_(M, num_rank, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M), "bfloat16"), - src: T.Tensor((M), "bfloat16"), + dst: T.Tensor((M), "bfloat16"), + src: T.Tensor((M), "bfloat16"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") @@ -31,7 +30,8 @@ def main( dst=T.address_of(dst[warp_start]), size=warp_copy_size, dst_pe=rank[0] ^ 1, - unroll_factor=4) + unroll_factor=4, + ) return main @@ -44,12 +44,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**25, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) kernel = tilelang.compile(kernel_(M, num_ranks, BLOCK_M, threads)) kernel.initialize(allocator=allocator) if local_rank == 0: @@ -80,9 +76,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=65536, help='M dimension') + parser.add_argument("--num-processes", type=int, default=2, 
help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=65536, help="M dimension") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/primitives/example_remote_st.py b/examples/distributed/primitives/example_remote_st.py index 251e5e08b..05f95f50d 100644 --- a/examples/distributed/primitives/example_remote_st.py +++ b/examples/distributed/primitives/example_remote_st.py @@ -8,15 +8,14 @@ from tilelang.distributed import init_dist tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def kernel_(M, num_rank, block_M, threads): - @T.prim_func def main( - dst: T.Tensor((M), "float32"), - src: T.Tensor((M), "float32"), + dst: T.Tensor((M), "float32"), + src: T.Tensor((M), "float32"), ): with T.Kernel(T.ceildiv(M, block_M), threads=threads) as (bx): rank = T.alloc_local([1], "uint64") @@ -36,12 +35,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**25, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) kernel = tilelang.compile(kernel_(M, num_ranks, BLOCK_M, threads)) kernel.initialize(allocator=allocator) if local_rank == 0: @@ -72,9 +67,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=1024, help='M dimension') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=1024, help="M dimension") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/primitives/example_sync.py b/examples/distributed/primitives/example_sync.py index fa5949a3f..eba17c442 100644 --- a/examples/distributed/primitives/example_sync.py +++ b/examples/distributed/primitives/example_sync.py @@ -7,7 +7,7 @@ from tilelang.distributed import init_dist tilelang.disable_cache() -os.environ['NCCL_DEBUG'] = 'WARN' # silence NCCL log +os.environ["NCCL_DEBUG"] = "WARN" # silence NCCL log def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): @@ -16,12 +16,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) allocator = tilelang.get_allocator( - size=2**25, - device="cuda", - is_distributed=True, - local_rank=local_rank, - num_local_ranks=num_local_ranks, - group=group) + size=2**25, device="cuda", is_distributed=True, local_rank=local_rank, num_local_ranks=num_local_ranks, group=group + ) dst = tilelang.tensor((M), torch.float32, allocator=allocator) srcs = tilelang.tensor((M), torch.float32, allocator=allocator, return_peers=True) @@ -39,9 +35,8 @@ def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--num-processes', type=int, default=2, help='Number of processes to spawn (default: 2)') - parser.add_argument('--M', type=int, default=65536, 
help='M dimension') + parser.add_argument("--num-processes", type=int, default=2, help="Number of processes to spawn (default: 2)") + parser.add_argument("--M", type=int, default=65536, help="M dimension") args = parser.parse_args() num_processes = args.num_processes diff --git a/examples/distributed/primitives/test_get_block.py b/examples/distributed/primitives/test_get_block.py index 6675965b0..63c52435a 100644 --- a/examples/distributed/primitives/test_get_block.py +++ b/examples/distributed/primitives/test_get_block.py @@ -5,6 +5,7 @@ import example_get_block +@tilelang.testing.requires_distributed @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_get_block(): diff --git a/examples/distributed/primitives/test_get_warp.py b/examples/distributed/primitives/test_get_warp.py index c482fa394..a542361fa 100644 --- a/examples/distributed/primitives/test_get_warp.py +++ b/examples/distributed/primitives/test_get_warp.py @@ -5,6 +5,7 @@ import example_get_warp +@tilelang.testing.requires_distributed @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_get_warp(): diff --git a/examples/distributed/primitives/test_put_block.py b/examples/distributed/primitives/test_put_block.py index 83ef08fb2..2e31de627 100644 --- a/examples/distributed/primitives/test_put_block.py +++ b/examples/distributed/primitives/test_put_block.py @@ -5,6 +5,7 @@ import example_put_block +@tilelang.testing.requires_distributed @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_put_block(): diff --git a/examples/distributed/primitives/test_put_warp.py b/examples/distributed/primitives/test_put_warp.py index de4cc1476..3b289cd27 100644 --- a/examples/distributed/primitives/test_put_warp.py +++ b/examples/distributed/primitives/test_put_warp.py @@ -5,6 +5,7 @@ import example_put_warp +@tilelang.testing.requires_distributed @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_put_warp(): diff --git a/examples/distributed/reduce_scatter.py b/examples/distributed/reduce_scatter.py index fcb8e997f..6ddc5707e 100644 --- a/examples/distributed/reduce_scatter.py +++ b/examples/distributed/reduce_scatter.py @@ -72,16 +72,13 @@ def __post_init__(self): for buf in self.signal_bufs: assert buf.shape[0] >= 2 * self.world_size - self.scatter_signal_bufs = [buf[:self.world_size] for buf in self.signal_bufs] - self.rs_per_node_signal_bufs = [ - buf[self.world_size:self.world_size * 2] for buf in self.signal_bufs - ] + self.scatter_signal_bufs = [buf[: self.world_size] for buf in self.signal_bufs] + self.rs_per_node_signal_bufs = [buf[self.world_size : self.world_size * 2] for buf in self.signal_bufs] for node_id in range(self.nnodes): self.scatter_signal_buf_list_for_each_node.append( - self.scatter_signal_bufs[self.local_rank][node_id * - self.local_world_size:(node_id + 1) * - self.local_world_size]) + self.scatter_signal_bufs[self.local_rank][node_id * self.local_world_size : (node_id + 1) * self.local_world_size] + ) def reset_barriers(self): self.signal_bufs[self.local_rank].fill_(0) @@ -93,9 +90,7 @@ def get_scatter_bufs_and_signal_for_each_node(self, input, node_id): M_per_node = M_per_rank * self.local_world_size M_start = node_id * M_per_node M_end = M_start + M_per_node - scatter_bufs_intra_node = [ - self.scatter_bufs[i][M_start:M_end] for i in range(self.local_world_size) - ] + scatter_bufs_intra_node = 
[self.scatter_bufs[i][M_start:M_end] for i in range(self.local_world_size)] return scatter_bufs_intra_node, self.scatter_signal_buf_list_for_each_node[node_id] @property @@ -123,50 +118,29 @@ def scatter_signal_buf(self) -> torch.Tensor: return self.scatter_signal_bufs[self.local_rank] -def create_reduce_scater_2d_ctx(max_M, - N, - rank, - world_size, - local_world_size, - dtype, - allocator, - overlap_with_gemm=True, - num_reduction_sms=15) -> ReduceScatter2DContext: +def create_reduce_scater_2d_ctx( + max_M, N, rank, world_size, local_world_size, dtype, allocator, overlap_with_gemm=True, num_reduction_sms=15 +) -> ReduceScatter2DContext: """ - for num_reduction_sms: tunable param, 16 are enough for H800 - For H800, we overlap local reduce and inter-node p2p with intra-node scatter. - The reduction kernel bandwidth is not a bottleneck if it exceeds 450GB, so only a few SMs are needed. - For machines with higher intra_node bandwidth(e.g. H100), we may need to increase the number of SMs or redesign overlapping. + num_reduction_sms: a tunable parameter; 16 SMs are enough for H800. + For H800, we overlap the local reduce and inter-node p2p with the intra-node scatter. + The reduction kernel bandwidth is not a bottleneck once it exceeds 450 GB/s, so only a few SMs are needed. + For machines with higher intra-node bandwidth (e.g. H100), we may need to increase the number of SMs or redesign the overlapping. """ assert world_size % local_world_size == 0 assert max_M % world_size == 0 scatter_bufs = tilelang.tensor((max_M, N), dtype, allocator=allocator, return_peers=True) - rs_per_node_bufs = tilelang.tensor((max_M // local_world_size, N), - dtype, - allocator=allocator, - return_peers=True) - p2p_bufs = tilelang.tensor((max_M // local_world_size, N), - dtype, - allocator=allocator, - return_peers=True) + rs_per_node_bufs = tilelang.tensor((max_M // local_world_size, N), dtype, allocator=allocator, return_peers=True) + p2p_bufs = tilelang.tensor((max_M // local_world_size, N), dtype, allocator=allocator, return_peers=True) # signal_buf: scatter_signal | rs_per_node_signal num_signal_bufs = 2 - signal_bufs = tilelang.tensor((world_size * num_signal_bufs), - dtype=torch.uint32, - allocator=allocator, - return_peers=True) - symm_barriers = tilelang.tensor((local_world_size,), - torch.int32, - allocator=allocator, - return_peers=True) + signal_bufs = tilelang.tensor((world_size * num_signal_bufs), dtype=torch.uint32, allocator=allocator, return_peers=True) + symm_barriers = tilelang.tensor((local_world_size,), torch.int32, allocator=allocator, return_peers=True) symm_barriers[rank] = 0 - counter_signal_buf = tilelang.tensor((local_world_size), - dtype=torch.uint32, - allocator=allocator, - return_peers=True) + counter_signal_buf = tilelang.tensor((local_world_size), dtype=torch.uint32, allocator=allocator, return_peers=True) dist.barrier() @@ -191,29 +165,21 @@ def create_reduce_scater_2d_ctx(max_M, reduction_stream=reduction_stream, num_sync_sms=num_sync_sms, num_p2p_sms=num_p2p_sms, - num_reduction_sms=num_reduction_sms) + num_reduction_sms=num_reduction_sms, + ) return ctx @tilelang.jit -def kernel_ring_reduce_tma(M_per_rank, - N, - block_M, - block_N, - begin_idx, - num_splits, - threads, - persistent=False, - dtype="float16", - accum_dtype="float"): - +def kernel_ring_reduce_tma( + M_per_rank, N, block_M, block_N, begin_idx, num_splits, threads, persistent=False, dtype="float16", accum_dtype="float" +): @T.prim_func def _kernel_ring_reduce_tma( - C: T.Tensor((M_per_rank * num_splits, N), dtype), - output: 
T.Tensor((M_per_rank, N), dtype), + C: T.Tensor((M_per_rank * num_splits, N), dtype), + output: T.Tensor((M_per_rank, N), dtype), ): - with T.Kernel( - T.ceildiv(M_per_rank, block_M), T.ceildiv(N, block_N), threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(M_per_rank, block_M), T.ceildiv(N, block_N), threads=threads) as (bx, by): data_shared = T.alloc_shared((block_M, block_N), dtype) init_shared = T.alloc_shared((block_M, block_N), dtype) data_local = T.alloc_fragment((block_M, block_N), dtype) @@ -233,10 +199,7 @@ def _kernel_ring_reduce_tma( return _kernel_ring_reduce_tma -def _wait_eq_cuda(signal_tensor: torch.Tensor, - signal: int, - stream: Optional[torch.cuda.Stream] = None, - require_i64=False): +def _wait_eq_cuda(signal_tensor: torch.Tensor, signal: int, stream: Optional[torch.cuda.Stream] = None, require_i64=False): stream = stream or torch.cuda.current_stream() if signal_tensor.dtype in (torch.int32, torch.uint32): (err,) = cuda.cuStreamWaitValue32( @@ -258,11 +221,13 @@ def _wait_eq_cuda(signal_tensor: torch.Tensor, raise Exception(f"Unsupported signal dtype {signal_tensor.dtype}") -def intra_node_scatter(input_intra_node, - scatter_bufs_intra_node: List[torch.Tensor], - scatter_signal_buf_intra_node: torch.Tensor, - local_rank, - overlap_with_gemm=True): +def intra_node_scatter( + input_intra_node, + scatter_bufs_intra_node: List[torch.Tensor], + scatter_signal_buf_intra_node: torch.Tensor, + local_rank, + overlap_with_gemm=True, +): M, N = input_intra_node.shape local_world_size = len(scatter_bufs_intra_node) M_per_rank = M // local_world_size @@ -275,10 +240,8 @@ def intra_node_scatter(input_intra_node, # print(f"scatter_signal_buf_intra_node[remote_local_rank]: {scatter_signal_buf_intra_node[remote_local_rank]}") if overlap_with_gemm: _wait_eq_cuda(scatter_signal_buf_intra_node[remote_local_rank], 1, stream) - src = input_intra_node[remote_local_rank * M_per_rank:(remote_local_rank + 1) * - M_per_rank, :] - dst = scatter_bufs_intra_node[remote_local_rank][local_rank * M_per_rank:(local_rank + 1) * - M_per_rank, :] + src = input_intra_node[remote_local_rank * M_per_rank : (remote_local_rank + 1) * M_per_rank, :] + dst = scatter_bufs_intra_node[remote_local_rank][local_rank * M_per_rank : (local_rank + 1) * M_per_rank, :] with torch.cuda.stream(stream): dst.copy_(src) @@ -292,21 +255,15 @@ def ring_reduce_tma( ): total_M, N = input.shape M_per_split = total_M // num_splits - assert output.shape[ - 0] == M_per_split and total_M % num_splits == 0, f"{output.shape}, {total_M}, {num_splits}" + assert output.shape[0] == M_per_split and total_M % num_splits == 0, f"{output.shape}, {total_M}, {num_splits}" def alloc_fn(size, alignment, stream): return torch.empty(size, device="cuda", dtype=torch.int8) if num_sms == -1: ring_reduce_tma_func = kernel_ring_reduce_tma( - M_per_split, - N, - block_M=64, - block_N=64, - begin_idx=begin_idx, - num_splits=num_splits, - threads=128) + M_per_split, N, block_M=64, block_N=64, begin_idx=begin_idx, num_splits=num_splits, threads=128 + ) # if begin_idx == 0: # print(ring_reduce_tma_func.get_kernel_source()) ring_reduce_tma_func(input, output, stream=torch.cuda.current_stream().cuda_stream) @@ -345,9 +302,7 @@ def ring_reduce( raise NotImplementedError("Only Hopper ring reduce is implemented now.") -def reduce_scatter_for_each_node(input: torch.Tensor, - ctx: ReduceScatter2DContext, - output: Optional[torch.Tensor] = None): +def reduce_scatter_for_each_node(input: torch.Tensor, ctx: ReduceScatter2DContext, output: Optional[torch.Tensor] = 
None): world_size = ctx.world_size local_world_size = ctx.local_world_size local_rank = ctx.local_rank @@ -364,18 +319,14 @@ def reduce_scatter_for_each_node(input: torch.Tensor, stream = torch.cuda.current_stream() for n in range(0, nnodes): cur_node_id = (node_id + n + 1) % nnodes - input_intra_node = input[cur_node_id * M_per_node:(cur_node_id + 1) * M_per_node] - scatter_bufs_intra_node, scatter_signal_buf_intra_node = ctx.get_scatter_bufs_and_signal_for_each_node( - input, cur_node_id) + input_intra_node = input[cur_node_id * M_per_node : (cur_node_id + 1) * M_per_node] + scatter_bufs_intra_node, scatter_signal_buf_intra_node = ctx.get_scatter_bufs_and_signal_for_each_node(input, cur_node_id) intra_node_scatter( - input_intra_node, - scatter_bufs_intra_node, - scatter_signal_buf_intra_node, - local_rank, - overlap_with_gemm=ctx.overlap_with_gemm) + input_intra_node, scatter_bufs_intra_node, scatter_signal_buf_intra_node, local_rank, overlap_with_gemm=ctx.overlap_with_gemm + ) # ring reduce intra node - rs_buf_cur_node = rs_per_node_buf[M_per_rank * cur_node_id:(cur_node_id + 1) * M_per_rank] + rs_buf_cur_node = rs_per_node_buf[M_per_rank * cur_node_id : (cur_node_id + 1) * M_per_rank] # nvshmem_barrier_all_on_stream(stream) reduction_stream.wait_stream(stream) with torch.cuda.stream(reduction_stream): @@ -385,7 +336,8 @@ def reduce_scatter_for_each_node(input: torch.Tensor, reduce_out_buf, local_rank, local_world_size, - num_sms=-1 if n == nnodes - 1 else num_reduction_sms) + num_sms=-1 if n == nnodes - 1 else num_reduction_sms, + ) # inter node p2p if nnodes > 1: @@ -408,12 +360,10 @@ def reduce_scatter_for_each_node(input: torch.Tensor, stream.wait_stream(reduction_stream) if nnodes == 1: return output - return p2p_buf[:M_per_rank * nnodes] + return p2p_buf[: M_per_rank * nnodes] -def reduce_scatter_multi_node(input: torch.Tensor, - ctx: ReduceScatter2DContext, - output: Optional[torch.Tensor] = None): +def reduce_scatter_multi_node(input: torch.Tensor, ctx: ReduceScatter2DContext, output: Optional[torch.Tensor] = None): """ A hierarchical reduce-scatter implementation that overlaps the intra-node scatter with the local reduce and the inter-node p2p(after reduce). 
It also provides a rank-wise @@ -443,9 +393,7 @@ def reduce_scatter_multi_node(input: torch.Tensor, return output -def reduce_scatter_2d_op(input: torch.Tensor, - ctx: ReduceScatter2DContext, - output: Optional[torch.Tensor] = None): +def reduce_scatter_2d_op(input: torch.Tensor, ctx: ReduceScatter2DContext, output: Optional[torch.Tensor] = None): M, N = input.shape assert input.dtype == ctx.dtype assert ctx.max_M >= M and ctx.N == N diff --git a/examples/distributed/sp_ag_attention_intra_node.py b/examples/distributed/sp_ag_attention_intra_node.py index 421f13393..b66684ae6 100644 --- a/examples/distributed/sp_ag_attention_intra_node.py +++ b/examples/distributed/sp_ag_attention_intra_node.py @@ -10,10 +10,13 @@ @tilelang.jit -def barrier_all_blocks_sys_kernel(num_local_rank,): - +def barrier_all_blocks_sys_kernel( + num_local_rank, +): @T.prim_func - def main(barrier: T.Tensor((num_local_rank), "int32"),): + def main( + barrier: T.Tensor((num_local_rank), "int32"), + ): with T.Kernel(1, threads=32): T.barrier_blocks(barrier) @@ -25,28 +28,36 @@ def main(barrier: T.Tensor((num_local_rank), "int32"),): tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, compile_flags=[ - "-O3", "-Wno-deprecated-declarations", "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", "--expt-relaxed-constexpr", "--expt-extended-lambda", - "--ptxas-options=-v,--register-usage-level=10", "-DNDEBUG" + "-O3", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", ], ) -def flashattn(batch_size, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - enable_zig_zag, - enable_specialized, - rank, - num_ranks, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) +def flashattn( + batch_size, + groups, + UQ, + UKV, + heads, + dim, + is_causal, + enable_zig_zag, + enable_specialized, + rank, + num_ranks, + block_M=64, + block_N=64, + num_stages=1, + threads=128, +): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [UQ, heads, dim] kv_shape = [UKV, head_kv, dim] @@ -83,8 +94,7 @@ def inner( global_offset_q: T.int32, kv_len_per_sp_block: T.int32, ): - T.copy(Q_unpad[q_start_idx + bx * block_M:q_start_idx + (bx + 1) * block_M, head_idx, :], - Q_shared) + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) @@ -92,30 +102,30 @@ def inner( prefix_len = k_current_seqlen - q_current_seqlen * num_ranks loop_range = ( - T.ceildiv(prefix_len + global_offset_q + (bx + 1) * block_M, block_N) - if is_causal else T.ceildiv(k_current_seqlen, block_N)) + T.ceildiv(prefix_len + global_offset_q + (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(k_current_seqlen, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): sp_block_idx = (k * block_N) // kv_len_per_sp_block - wait_rank = ( - sp_block_idx if sp_block_idx < num_ranks else 2 * num_ranks - sp_block_idx - 1) - kv_load_offset = ((k * block_N) % kv_len_per_sp_block + - sp_block_idx // num_ranks * kv_len_per_sp_block + wait_rank * - (k_current_seqlen // num_ranks)) - T.copy( - K_unpad[k_start_idx + kv_load_offset:k_start_idx + kv_load_offset + block_N, - 
kv_head_idx, :], K_shared) + wait_rank = sp_block_idx if sp_block_idx < num_ranks else 2 * num_ranks - sp_block_idx - 1 + kv_load_offset = ( + (k * block_N) % kv_len_per_sp_block + + sp_block_idx // num_ranks * kv_len_per_sp_block + + wait_rank * (k_current_seqlen // num_ranks) + ) + T.copy(K_unpad[k_start_idx + kv_load_offset : k_start_idx + kv_load_offset + block_N, kv_head_idx, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( - (prefix_len + global_offset_q + bx * block_M + i < k * block_N + j) or - (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), -1e9, 0) + (prefix_len + global_offset_q + bx * block_M + i < k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), -1e9, 0) + acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), -1e9, 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -138,9 +148,7 @@ def inner( for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] - T.copy( - V_unpad[v_start_idx + kv_load_offset:v_start_idx + kv_load_offset + block_N, - kv_head_idx, :], V_shared) + T.copy(V_unpad[v_start_idx + kv_load_offset : v_start_idx + kv_load_offset + block_N, kv_head_idx, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) @@ -154,17 +162,15 @@ def inner( @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), + cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -194,24 +200,46 @@ def main( global_offset_q = q_current_seqlen * rank kv_len_per_sp_block = k_current_seqlen // num_ranks - inner(Q_unpad, K_unpad, V_unpad, Output_unpad, Q_shared, K_shared, V_shared, O_shared, - acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum, q_start_idx, k_start_idx, v_start_idx, q_current_seqlen, k_current_seqlen, - bx, head_idx, kv_head_idx, global_offset_q, kv_len_per_sp_block) + inner( + Q_unpad, + K_unpad, + V_unpad, + Output_unpad, + Q_shared, + K_shared, + V_shared, + O_shared, + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + scores_scale, + scores_sum, + logsum, + q_start_idx, + k_start_idx, + v_start_idx, + q_current_seqlen, + k_current_seqlen, + bx, + head_idx, + kv_head_idx, + global_offset_q, + kv_len_per_sp_block, + ) @T.prim_func def main_zigzag( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - 
cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), + cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -239,27 +267,51 @@ def main_zigzag( k_current_seqlen = k_end_idx - k_start_idx half_q_shard_len = q_current_seqlen // 2 - global_offset_q = rank * half_q_shard_len if bx * block_M < half_q_shard_len else \ - q_current_seqlen * num_ranks - (rank + 2) * half_q_shard_len + global_offset_q = ( + rank * half_q_shard_len if bx * block_M < half_q_shard_len else q_current_seqlen * num_ranks - (rank + 2) * half_q_shard_len + ) kv_len_per_sp_block = k_current_seqlen // (2 * num_ranks) - inner(Q_unpad, K_unpad, V_unpad, Output_unpad, Q_shared, K_shared, V_shared, O_shared, - acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum, q_start_idx, k_start_idx, v_start_idx, q_current_seqlen, k_current_seqlen, - bx, head_idx, kv_head_idx, global_offset_q, kv_len_per_sp_block) + inner( + Q_unpad, + K_unpad, + V_unpad, + Output_unpad, + Q_shared, + K_shared, + V_shared, + O_shared, + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + scores_scale, + scores_sum, + logsum, + q_start_idx, + k_start_idx, + v_start_idx, + q_current_seqlen, + k_current_seqlen, + bx, + head_idx, + kv_head_idx, + global_offset_q, + kv_len_per_sp_block, + ) @T.prim_func def main_specialized( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), + cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=384) as (bx_, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=384) as (bx_, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -279,10 +331,12 @@ def main_specialized( bar_k_release = T.alloc_barrier(arrive_count=256) bar_v_release = T.alloc_barrier(arrive_count=256) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + } + ) batch_idx = bz head_idx = by @@ -311,7 +365,9 @@ def main_specialized( prefix_len = k_current_seqlen - q_current_seqlen * num_ranks loop_range = ( 
T.ceildiv(prefix_len + global_offset_q + (bx + 1) * block_M, block_N) - if is_causal else T.ceildiv(k_current_seqlen, block_N)) + if is_causal + else T.ceildiv(k_current_seqlen, block_N) + ) T.barrier_wait(bar_q_ready, 0) for k in T.serial(loop_range): @@ -319,21 +375,18 @@ def main_specialized( for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( (prefix_len + global_offset_q + bx * block_M + i < k * block_N + j) - or (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), -1e9, 0) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -1e9, 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), -1e9, 0 + ) T.barrier_wait(bar_k_ready, k % 2) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.barrier_arrive(bar_k_release) T.copy(scores_max, scores_max_prev) @@ -371,35 +424,30 @@ def main_specialized( prefix_len = k_current_seqlen - q_current_seqlen * num_ranks loop_range = ( T.ceildiv(prefix_len + global_offset_q + (bx + 1) * block_M, block_N) - if is_causal else T.ceildiv(k_current_seqlen, block_N)) - T.copy( - Q_unpad[q_start_idx + bx * block_M:q_start_idx + (bx + 1) * block_M, - head_idx, :], Q_shared) + if is_causal + else T.ceildiv(k_current_seqlen, block_N) + ) + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) T.barrier_arrive(bar_q_ready) for k in T.serial(loop_range): T.barrier_wait(bar_k_release, (k + 1) % 2) - T.copy( - K_unpad[k_start_idx + (k * block_N):k_start_idx + (k * block_N) + block_N, - kv_head_idx, :], K_shared) + T.copy(K_unpad[k_start_idx + (k * block_N) : k_start_idx + (k * block_N) + block_N, kv_head_idx, :], K_shared) T.barrier_arrive(bar_k_ready) T.barrier_wait(bar_v_release, (k + 1) % 2) - T.copy( - V_unpad[v_start_idx + (k * block_N):v_start_idx + (k * block_N) + block_N, - kv_head_idx, :], V_shared) + T.copy(V_unpad[v_start_idx + (k * block_N) : v_start_idx + (k * block_N) + block_N, kv_head_idx, :], V_shared) T.barrier_arrive(bar_v_ready) @T.prim_func def main_specialized_zigzag( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), + cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=384) as (bx_, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=384) as (bx_, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -420,10 +468,12 @@ def main_specialized_zigzag( bar_k_release = T.alloc_barrier(arrive_count=256) bar_v_release = T.alloc_barrier(arrive_count=256) - T.annotate_layout({ - 
O_shared: tilelang.layout.make_swizzled_layout(O_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - }) + T.annotate_layout( + { + O_shared: tilelang.layout.make_swizzled_layout(O_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + } + ) batch_idx = bz head_idx = by @@ -441,8 +491,9 @@ def main_specialized_zigzag( bx = T.ceildiv(max_seqlen_q, block_M) - bx_ - 1 half_q_shard_len = q_current_seqlen // 2 - global_offset_q = rank * half_q_shard_len if bx * block_M < half_q_shard_len else \ - q_current_seqlen * num_ranks - (rank + 2) * half_q_shard_len + global_offset_q = ( + rank * half_q_shard_len if bx * block_M < half_q_shard_len else q_current_seqlen * num_ranks - (rank + 2) * half_q_shard_len + ) kv_len_per_sp_block = k_current_seqlen // (2 * num_ranks) tid = T.get_thread_binding(0) @@ -455,7 +506,9 @@ def main_specialized_zigzag( prefix_len = k_current_seqlen - q_current_seqlen * num_ranks loop_range = ( T.ceildiv(prefix_len + global_offset_q + (bx + 1) * block_M, block_N) - if is_causal else T.ceildiv(k_current_seqlen, block_N)) + if is_causal + else T.ceildiv(k_current_seqlen, block_N) + ) T.barrier_wait(bar_q_ready, 0) for k in T.serial(loop_range): @@ -463,21 +516,18 @@ def main_specialized_zigzag( for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( (prefix_len + global_offset_q + bx * block_M + i < k * block_N + j) - or (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), -1e9, 0) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -1e9, 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= k_current_seqlen), -1e9, 0 + ) T.barrier_wait(bar_k_ready, k % 2) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.barrier_arrive(bar_k_release) T.copy(scores_max, scores_max_prev) @@ -515,28 +565,24 @@ def main_specialized_zigzag( prefix_len = k_current_seqlen - q_current_seqlen * num_ranks loop_range = ( T.ceildiv(prefix_len + global_offset_q + (bx + 1) * block_M, block_N) - if is_causal else T.ceildiv(k_current_seqlen, block_N)) - T.copy( - Q_unpad[q_start_idx + bx * block_M:q_start_idx + (bx + 1) * block_M, - head_idx, :], Q_shared) + if is_causal + else T.ceildiv(k_current_seqlen, block_N) + ) + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) T.barrier_arrive(bar_q_ready) for k in T.serial(loop_range): sp_block_idx = (k * block_N) // kv_len_per_sp_block - wait_rank = ( - sp_block_idx if sp_block_idx < num_ranks else 2 * num_ranks - sp_block_idx - - 1) - kv_load_offset = ((k * block_N) % kv_len_per_sp_block + - sp_block_idx // num_ranks * kv_len_per_sp_block + wait_rank * - (k_current_seqlen // num_ranks)) + wait_rank = sp_block_idx if sp_block_idx < num_ranks else 2 * num_ranks - sp_block_idx - 1 + kv_load_offset = ( + (k * block_N) % kv_len_per_sp_block + + sp_block_idx // num_ranks * kv_len_per_sp_block + + wait_rank * (k_current_seqlen // num_ranks) + ) T.barrier_wait(bar_k_release, (k + 1) % 2) - T.copy( - K_unpad[k_start_idx + kv_load_offset:k_start_idx + kv_load_offset + block_N, - kv_head_idx, :], K_shared) + T.copy(K_unpad[k_start_idx + kv_load_offset : 
k_start_idx + kv_load_offset + block_N, kv_head_idx, :], K_shared) T.barrier_arrive(bar_k_ready) T.barrier_wait(bar_v_release, (k + 1) % 2) - T.copy( - V_unpad[v_start_idx + kv_load_offset:v_start_idx + kv_load_offset + block_N, - kv_head_idx, :], V_shared) + T.copy(V_unpad[v_start_idx + kv_load_offset : v_start_idx + kv_load_offset + block_N, kv_head_idx, :], V_shared) T.barrier_arrive(bar_v_ready) if enable_specialized: @@ -571,16 +617,14 @@ def create_sp_ag_attention_context_intra_node( device, allocator, ): - ag_k_buffers = tilelang.tensor((batch_size * max_seqlen_k, kv_head, head_dim), - dtype=input_dtype, - allocator=allocator, - return_peers=True) + ag_k_buffers = tilelang.tensor( + (batch_size * max_seqlen_k, kv_head, head_dim), dtype=input_dtype, allocator=allocator, return_peers=True + ) ag_k_buffer = ag_k_buffers[rank] - ag_v_buffers = tilelang.tensor((batch_size * max_seqlen_k, kv_head, head_dim), - dtype=input_dtype, - allocator=allocator, - return_peers=True) + ag_v_buffers = tilelang.tensor( + (batch_size * max_seqlen_k, kv_head, head_dim), dtype=input_dtype, allocator=allocator, return_peers=True + ) ag_v_buffer = ag_v_buffers[rank] attn_output_buffer = torch.empty( @@ -603,14 +647,16 @@ def create_sp_ag_attention_context_intra_node( ag_v_buffer=ag_v_buffer, attn_output_buffer=attn_output_buffer, ag_stream=ag_stream, - barrier=barrier) + barrier=barrier, + ) return ctx def barrier_all_on_stream(barrier: torch.Tensor, stream: torch.cuda.Stream, world_size: int): barrier_all_blocks_sys_func = barrier_all_blocks_sys_kernel(world_size) - barrier_all_blocks_sys_func(barrier, stream=stream.cuda_stream) + with torch.cuda.stream(stream): + barrier_all_blocks_sys_func(barrier) def cp_engine_producer_kv_all_gather( @@ -681,12 +727,12 @@ def _cp_engine_copy_data(dst_ptr, src_ptr, cp_size, stream): for offset in range(1, world_size): src_rank = (rank + offset) % world_size - k_src_ptr = (k_shards[src_rank].data_ptr() + byte_start // world_size) - k_dst_ptr = (k_buffers[rank].data_ptr() + byte_start + src_rank * byte_per_rank) + k_src_ptr = k_shards[src_rank].data_ptr() + byte_start // world_size + k_dst_ptr = k_buffers[rank].data_ptr() + byte_start + src_rank * byte_per_rank _cp_engine_copy_data(k_dst_ptr, k_src_ptr, cp_size, ag_stream) - v_src_ptr = (v_shards[src_rank].data_ptr() + byte_start // world_size) - v_dst_ptr = (v_buffers[rank].data_ptr() + byte_start + src_rank * byte_per_rank) + v_src_ptr = v_shards[src_rank].data_ptr() + byte_start // world_size + v_dst_ptr = v_buffers[rank].data_ptr() + byte_start + src_rank * byte_per_rank _cp_engine_copy_data(v_dst_ptr, v_src_ptr, cp_size, ag_stream) barrier_all_on_stream(barrier, ag_stream, world_size) @@ -710,7 +756,6 @@ def fused_sp_ag_attn_intra_node( enable_specialized: bool = False, print_source: bool = False, ): - BLOCK_M = 128 BLOCK_N = 128 num_stages = 2 @@ -764,20 +809,14 @@ def fused_sp_ag_attn_intra_node( block_M=BLOCK_M, block_N=BLOCK_N, num_stages=num_stages, - threads=threads) + threads=threads, + ) if rank == 0 and print_source: print(kernel.get_kernel_source()) - kernel( - q_shard, - ag_k, - ag_v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - output, - stream=compute_stream.cuda_stream) + with torch.cuda.stream(compute_stream): + kernel(q_shard, ag_k, ag_v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, output) compute_stream.wait_stream(ctx.ag_stream) barrier_all_on_stream(ctx.barrier, compute_stream, world_size) diff --git a/examples/distributed/triton_sp.py b/examples/distributed/triton_sp.py index 
d8236259b..1b99a5fac 100644 --- a/examples/distributed/triton_sp.py +++ b/examples/distributed/triton_sp.py @@ -97,8 +97,7 @@ def store_v4_b32_cond(ptr, val0, val1, val2, val3, mask, _semantic=None): } """, constraints=("=r,l,r,r,r,r,r"), # no use output - args=[ptr, val0, val1, val2, val3, - mask.to(tl.int32, _semantic=_semantic)], + args=[ptr, val0, val1, val2, val3, mask.to(tl.int32, _semantic=_semantic)], dtype=tl.int32, is_pure=False, pack=1, @@ -125,7 +124,7 @@ def _matmul_launch_metadata(grid, kernel, args): bytes_per_elem = args["c_ptr"].element_size() else: bytes_per_elem = 1 if args["FP8_OUTPUT"] else 2 - ret[f"flops{bytes_per_elem * 8}"] = 2. * M * N * K + ret[f"flops{bytes_per_elem * 8}"] = 2.0 * M * N * K ret["bytes"] = bytes_per_elem * (M * K + N * K + M * N) return ret @@ -138,13 +137,12 @@ def _kernel_consumer_gemm_persistent_repr(proxy): c_dtype = proxy.signature["c_ptr"].lstrip("*") BM, BN, BK = constexprs["BLOCK_SIZE_M"], constexprs["BLOCK_SIZE_N"], constexprs["BLOCK_SIZE_K"] - return f"cutlass_triton3x_sm{cap_major}{cap_minor}_a2a_consumer_gemm_persistent_tensorop_{a_dtype}_{b_dtype}_{c_dtype}_{BM}x{BN}x{BK}_ntn" + return ( + f"cutlass_triton3x_sm{cap_major}{cap_minor}_a2a_consumer_gemm_persistent_tensorop_{a_dtype}_{b_dtype}_{c_dtype}_{BM}x{BN}x{BK}_ntn" + ) -@triton.jit( - do_not_specialize=["sp_rank"], - launch_metadata=_matmul_launch_metadata, - repr=_kernel_consumer_gemm_persistent_repr) +@triton.jit(do_not_specialize=["sp_rank"], launch_metadata=_matmul_launch_metadata, repr=_kernel_consumer_gemm_persistent_repr) def matmul_kernel_descriptor_persistent( a_ptr, b_ptr, @@ -176,13 +174,10 @@ def matmul_kernel_descriptor_persistent( tl.static_assert(K % sp_size == 0, f"K {K} must be divisible by sp_size {sp_size}") K_per_sp_rank: tl.constexpr = K // sp_size - tl.static_assert( - K_per_sp_rank % BLOCK_SIZE_K == 0, - f"K_per_sp_rank {K_per_sp_rank} must be divisible by BLOCK_SIZE_K {BLOCK_SIZE_K}") + tl.static_assert(K_per_sp_rank % BLOCK_SIZE_K == 0, f"K_per_sp_rank {K_per_sp_rank} must be divisible by BLOCK_SIZE_K {BLOCK_SIZE_K}") k_tiles: tl.constexpr = K // BLOCK_SIZE_K - tl.static_assert(A2A_TILE_N % BLOCK_SIZE_K == 0, - f"A2A_TILE_N {A2A_TILE_N} must be divisible by BLOCK_SIZE_N {BLOCK_SIZE_K}") + tl.static_assert(A2A_TILE_N % BLOCK_SIZE_K == 0, f"A2A_TILE_N {A2A_TILE_N} must be divisible by BLOCK_SIZE_N {BLOCK_SIZE_K}") NUM_K_PER_TILE: tl.constexpr = A2A_TILE_N // BLOCK_SIZE_K # This is used for k-swizzle # k_tiles_per_rank: tl.constexpr = K_per_sp_rank // BLOCK_SIZE_K @@ -212,10 +207,8 @@ def matmul_kernel_descriptor_persistent( tile_id_c = start_pid - NUM_GEMM_SMS num_pid_in_group = GROUP_SIZE_M * num_pid_n - for tile_id in tl.range( - start_pid, num_tiles, NUM_GEMM_SMS, flatten=False, warp_specialize=WARP_SPECIALIZE): - pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, - NUM_GEMM_SMS) + for tile_id in tl.range(start_pid, num_tiles, NUM_GEMM_SMS, flatten=False, warp_specialize=WARP_SPECIALIZE): + pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_GEMM_SMS) offs_am = pid_m * BLOCK_SIZE_M offs_bn = pid_n * BLOCK_SIZE_N @@ -235,12 +228,12 @@ def matmul_kernel_descriptor_persistent( if ki % NUM_K_PER_TILE == 0: for chunk_id in range(chunk_beg, chunk_end + 1): token = dl.wait( - gemm_barrier_ptr + chunk_id * (k_tiles // NUM_K_PER_TILE) + - ki // NUM_K_PER_TILE, + gemm_barrier_ptr + chunk_id * (k_tiles // NUM_K_PER_TILE) + ki // NUM_K_PER_TILE, 1, scope="gpu", semantic="acquire", - waitValue=1) + 
waitValue=1, + ) a_desc = dl.consume_token(a_desc, token) offs_k = ki * BLOCK_SIZE_K a = a_desc.load([offs_am, offs_k]) @@ -248,15 +241,13 @@ def matmul_kernel_descriptor_persistent( accumulator = tl.dot(a, b.T, accumulator) tile_id_c += NUM_GEMM_SMS - pid_m, pid_n = _compute_pid(tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, - NUM_GEMM_SMS) + pid_m, pid_n = _compute_pid(tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_GEMM_SMS) offs_cm = pid_m * BLOCK_SIZE_M offs_cn = pid_n * BLOCK_SIZE_N if HAS_BIAS: offs_bias_n = tl.arange(0, BLOCK_SIZE_N) - bias_data = tl.load( - bias_ptr + offs_cn + offs_bias_n, mask=(offs_cn + offs_bias_n < N)).to(tl.float32) + bias_data = tl.load(bias_ptr + offs_cn + offs_bias_n, mask=(offs_cn + offs_bias_n < N)).to(tl.float32) accumulator = accumulator + bias_data[None, :] if EPILOGUE_SUBTILE: @@ -272,15 +263,7 @@ def matmul_kernel_descriptor_persistent( c_desc.store([offs_cm, offs_cn], c) -def matmul_descriptor_persistent(sp_rank, - sp_size, - a, - b, - bias, - c, - gemm_barrier, - gemm_config: triton.Config, - warp_specialize: bool = False): +def matmul_descriptor_persistent(sp_rank, sp_size, a, b, bias, c, gemm_barrier, gemm_config: triton.Config, warp_specialize: bool = False): # Check constraints. assert a.shape[1] == b.shape[1], "Incompatible dimensions" # b is transposed assert a.dtype == b.dtype, "Incompatible dtypes" @@ -295,8 +278,7 @@ def alloc_fn(size: int, alignment: int, stream: Optional[int]): triton.set_allocator(alloc_fn) def grid(META): - return (min(META["NUM_GEMM_SMS"], - triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"])),) + return (min(META["NUM_GEMM_SMS"], triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"])),) matmul_kernel_descriptor_persistent[grid]( a, @@ -350,8 +332,7 @@ def kernel_all2all_push_intra_node_nvl( if FUSE_SYNC: tl.static_assert(SUPPORT_ATOMIC, "FUSE_SYNC requires SUPPORT_ATOMIC to be True") - barrier_all_intra_node_atomic_cas_block(sp_rank, rank, sp_size, - intra_node_sync_buf_ptr + pid * sp_size) + barrier_all_intra_node_atomic_cas_block(sp_rank, rank, sp_size, intra_node_sync_buf_ptr + pid * sp_size) for i in tl.static_range(sp_size + 1): tl.store(cum_seqlen_gpu_ptr + i, cum_seqlen_cpu_tuple[i]) @@ -363,13 +344,11 @@ def kernel_all2all_push_intra_node_nvl( offs_n = tl.arange(0, BLOCK_N // VEC) if sp_size <= NUM_COMM_SM: - tl.static_assert(NUM_COMM_SM % sp_size == 0, - f"NUM_COMM_SM {NUM_COMM_SM} must be divisible by sp_size {sp_size}") + tl.static_assert(NUM_COMM_SM % sp_size == 0, f"NUM_COMM_SM {NUM_COMM_SM} must be divisible by sp_size {sp_size}") NUM_SM_PER_SP: tl.constexpr = NUM_COMM_SM // sp_size NUM_SP_PER_SM: tl.constexpr = 1 else: - tl.static_assert(sp_size % NUM_COMM_SM == 0, - f"sp_size {sp_size} must be divisible by NUM_COMM_SM {NUM_COMM_SM}") + tl.static_assert(sp_size % NUM_COMM_SM == 0, f"sp_size {sp_size} must be divisible by NUM_COMM_SM {NUM_COMM_SM}") NUM_SM_PER_SP: tl.constexpr = 1 NUM_SP_PER_SM: tl.constexpr = sp_size // NUM_COMM_SM @@ -384,8 +363,8 @@ def kernel_all2all_push_intra_node_nvl( remote_seq_len = seq_end - seq_beg num_tile_m = tl.cdiv(remote_seq_len, BLOCK_M) tl.static_assert( - local_head * head_dim % BLOCK_N == 0, - f"local_head * head_dim {local_head * head_dim} must be divisible by BLOCK_N {BLOCK_N}") + local_head * head_dim % BLOCK_N == 0, f"local_head * head_dim {local_head * head_dim} must be divisible by BLOCK_N {BLOCK_N}" + ) num_tile_n = local_head * head_dim // BLOCK_N for tile_id_m_outer_n_tail in range(0, 
tl.cdiv(num_tile_m, GROUP_SIZE_M) * num_tile_n): @@ -398,32 +377,32 @@ def kernel_all2all_push_intra_node_nvl( attn_mask_m = attn_offs_m < seq_end attn_offs_n = tile_id_n_tail * BLOCK_N + offs_n * VEC data0, data1, data2, data3 = load_v4_b32_cond( - attn_out_ptr + attn_offs_m[:, None] * local_head * head_dim + - attn_offs_n[None, :], - mask=attn_mask_m[:, None]) + attn_out_ptr + attn_offs_m[:, None] * local_head * head_dim + attn_offs_n[None, :], mask=attn_mask_m[:, None] + ) out_offs_m = tile_id_m_tail * BLOCK_M + offs_m out_mask_m = out_offs_m < remote_seq_len out_offs_n = sp_rank * local_head * head_dim + tile_id_n_tail * BLOCK_N + offs_n * VEC store_v4_b32_cond( - remote_a2a_out_ptr + out_offs_m[:, None] * global_head * head_dim + - out_offs_n[None, :], + remote_a2a_out_ptr + out_offs_m[:, None] * global_head * head_dim + out_offs_n[None, :], data0, data1, data2, data3, - mask=out_mask_m[:, None]) + mask=out_mask_m[:, None], + ) if not SKIP_BARRIER: __syncthreads() - notify_barrier_ptr = remote_barrier_ptr + tile_id_m_tail * num_tile_n * sp_size + sp_rank * num_tile_n + tile_id_n_tail + notify_barrier_ptr = ( + remote_barrier_ptr + tile_id_m_tail * num_tile_n * sp_size + sp_rank * num_tile_n + tile_id_n_tail + ) thread_idx = tid(0) if thread_idx == 0: st(notify_barrier_ptr, 1, scope="sys", semantic="release") class SpUlysessOAll2AllGemmKernel: - def __init__( self, world_group: torch.distributed.ProcessGroup, @@ -492,14 +471,13 @@ def finalize(self): def init_symm_buffer(self): max_local_seq = self.max_seqlen // self.sp_size self._comm_output_buffer = nvshmem_create_tensor( - [self.max_num_comm_buf, self.max_batch, max_local_seq, self.num_head * self.head_dim], - self.input_dtype) + [self.max_num_comm_buf, self.max_batch, max_local_seq, self.num_head * self.head_dim], self.input_dtype + ) self._barrier_buffer = nvshmem_create_tensor( - [triton.cdiv(self.max_batch * self.max_seqlen, self.BLOCK_SIZE_M) * self.num_head], - torch.int32) + [triton.cdiv(self.max_batch * self.max_seqlen, self.BLOCK_SIZE_M) * self.num_head], torch.int32 + ) self._barrier_buffer.zero_() - self._intra_node_sync_buffer = nvshmem_create_tensor([self.sp_size * self.max_sms], - torch.int32) + self._intra_node_sync_buffer = nvshmem_create_tensor([self.sp_size * self.max_sms], torch.int32) self._intra_node_sync_buffer.zero_() self._sp_group_sync_buffer = nvshmem_create_tensor([self.world_size], torch.int32) self._sp_group_sync_buffer.zero_() @@ -525,30 +503,31 @@ def sp_group_barrier_all_intra_node(self, stream=None): stream = torch.cuda.current_stream() if stream is None else stream sp_local_rank = self.local_rank % self.sp_size with torch.cuda.stream(stream): - barrier_all_intra_node_atomic_cas_block[(1,)](sp_local_rank, self.rank, self.sp_size, - self._sp_group_sync_buffer) + barrier_all_intra_node_atomic_cas_block[(1,)](sp_local_rank, self.rank, self.sp_size, self._sp_group_sync_buffer) def reset_cusum_seq_lens(self, local_seqlen, seq_lens_cpu=None): if seq_lens_cpu is None: seq_lens_cpu = [local_seqlen] * self.sp_size else: seq_lens_cpu = seq_lens_cpu.tolist() - assert local_seqlen == seq_lens_cpu[ - self.local_rank % self. 
- sp_size], f"local_seqlen {local_seqlen} != seq_lens_cpu[{self.local_rank % self.sp_size}]={seq_lens_cpu[self.local_rank % self.sp_size]}" + assert local_seqlen == seq_lens_cpu[self.local_rank % self.sp_size], ( + f"local_seqlen {local_seqlen} != seq_lens_cpu[{self.local_rank % self.sp_size}]={seq_lens_cpu[self.local_rank % self.sp_size]}" + ) cum_seqlen_cpu = [0] + list(itertools.accumulate(seq_lens_cpu)) self._cum_seq_len_cpu_tuple = tuple(cum_seqlen_cpu) - def forward(self, - inputs: torch.Tensor, - weight: torch.Tensor, - seq_lens_cpu: Optional[torch.Tensor] = None, - bias: Optional[torch.Tensor] = None, - output: Optional[torch.Tensor] = None, - a2a_output: Optional[torch.Tensor] = None, - transpose_weight: bool = False, - num_comm_sms: int = -1, - sm_margin: int = 0): + def forward( + self, + inputs: torch.Tensor, + weight: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + output: Optional[torch.Tensor] = None, + a2a_output: Optional[torch.Tensor] = None, + transpose_weight: bool = False, + num_comm_sms: int = -1, + sm_margin: int = 0, + ): if num_comm_sms == -1: num_comm_sms = self.world_size assert num_comm_sms >= 0, "num_comm_sms must be non-negative" @@ -582,7 +561,7 @@ def forward(self, self.reset_cusum_seq_lens(local_seqlen=local_seq_len, seq_lens_cpu=seq_lens_cpu) - gemm_input_a = self._comm_output_buffer.view(-1)[:M * K].view([M, K]) + gemm_input_a = self._comm_output_buffer.view(-1)[: M * K].view([M, K]) cur_stream = torch.cuda.current_stream() @@ -618,46 +597,42 @@ def forward(self, ) if output is None: - output = torch.empty([bs, local_seq_len, N], - device=inputs.device, - dtype=self.output_dtype) + output = torch.empty([bs, local_seq_len, N], device=inputs.device, dtype=self.output_dtype) assert len(output.shape) == 3, f"output must be 4D tensor, got {len(output)}D" - assert output.shape[ - 0] == bs, f"output batch size {output.shape[0]} must be equal to input batch size {bs}" - assert output.shape[ - 1] == local_seq_len, f"output seq_len {output.shape[1]} must be equal to local_seq_len {local_seq_len}" - assert output.shape[ - 2] == N, f"output head {output.shape[2]} must be equal to output size {N}" + assert output.shape[0] == bs, f"output batch size {output.shape[0]} must be equal to input batch size {bs}" + assert output.shape[1] == local_seq_len, f"output seq_len {output.shape[1]} must be equal to local_seq_len {local_seq_len}" + assert output.shape[2] == N, f"output head {output.shape[2]} must be equal to output size {N}" assert output.is_contiguous(), f"output must be contiguous, got {output.shape}" - assert self.max_gemm_sms - num_comm_sms - sm_margin > 0, f"max_gemm_sms {self.max_gemm_sms} - num_comm_sms {num_comm_sms} - sm_margin {sm_margin} must be greater than 0" + assert self.max_gemm_sms - num_comm_sms - sm_margin > 0, ( + f"max_gemm_sms {self.max_gemm_sms} - num_comm_sms {num_comm_sms} - sm_margin {sm_margin} must be greater than 0" + ) gemm_config = triton.Config( { - 'BLOCK_SIZE_M': self.BLOCK_SIZE_M, - 'BLOCK_SIZE_N': self.BLOCK_SIZE_N, - 'BLOCK_SIZE_K': self.BLOCK_SIZE_K, - 'GROUP_SIZE_M': self.GROUP_SIZE_M, - 'A2A_TILE_M': self.A2A_TILE_M, - 'A2A_TILE_N': self.A2A_TILE_N, - 'NUM_GEMM_SMS': self.max_gemm_sms - num_comm_sms - sm_margin + "BLOCK_SIZE_M": self.BLOCK_SIZE_M, + "BLOCK_SIZE_N": self.BLOCK_SIZE_N, + "BLOCK_SIZE_K": self.BLOCK_SIZE_K, + "GROUP_SIZE_M": self.GROUP_SIZE_M, + "A2A_TILE_M": self.A2A_TILE_M, + "A2A_TILE_N": self.A2A_TILE_N, + "NUM_GEMM_SMS": self.max_gemm_sms - num_comm_sms - 
sm_margin, }, num_stages=self.num_stages, - num_warps=self.num_warps) + num_warps=self.num_warps, + ) with torch.cuda.stream(self.compute_stream): - matmul_descriptor_persistent(self.sp_rank, self.sp_size, gemm_input_a, weight, bias, - output, self._barrier_buffer, gemm_config, - self.warp_specialize) + matmul_descriptor_persistent( + self.sp_rank, self.sp_size, gemm_input_a, weight, bias, output, self._barrier_buffer, gemm_config, self.warp_specialize + ) if a2a_output is not None: - assert a2a_output.shape == ( - bs, local_seq_len, local_head * self.sp_size, head_dim - ), f"a2a_output shape {a2a_output.shape} must be equal to (bs, local_seq_len, local_head * self.sp_size, head_dim) ({bs}, {local_seq_len}, {local_head * self.sp_size}, {head_dim})" - assert a2a_output.is_contiguous( - ), f"a2a_output must be contiguous, got {a2a_output.shape}" - a2a_output.copy_( - gemm_input_a.view(bs, local_seq_len, local_head * self.sp_size * head_dim)) + assert a2a_output.shape == (bs, local_seq_len, local_head * self.sp_size, head_dim), ( + f"a2a_output shape {a2a_output.shape} must be equal to (bs, local_seq_len, local_head * self.sp_size, head_dim) ({bs}, {local_seq_len}, {local_head * self.sp_size}, {head_dim})" + ) + assert a2a_output.is_contiguous(), f"a2a_output must be contiguous, got {a2a_output.shape}" + a2a_output.copy_(gemm_input_a.view(bs, local_seq_len, local_head * self.sp_size * head_dim)) ret = (output, a2a_output) else: ret = (output,) @@ -701,7 +676,7 @@ def post_attn_a2a( self.reset_cusum_seq_lens(local_seqlen=local_seq_len, seq_lens_cpu=seq_lens_cpu) assert comm_buf_idx < self.max_num_comm_buf, f"comm_buf_idx {comm_buf_idx} must be less than num_comm_buf {self.max_num_comm_buf}" - gemm_input_a = self._comm_output_buffer[comm_buf_idx].view(-1)[:M * K].view([M, K]) + gemm_input_a = self._comm_output_buffer[comm_buf_idx].view(-1)[: M * K].view([M, K]) cur_stream = torch.cuda.current_stream() diff --git a/examples/dsa_sparse_finetune/dsa.py b/examples/dsa_sparse_finetune/dsa.py new file mode 100644 index 000000000..9fae8e5e3 --- /dev/null +++ b/examples/dsa_sparse_finetune/dsa.py @@ -0,0 +1,223 @@ +from typing import Optional +import torch +import torch.nn.functional as F +from indexer_topk_reducesum import indexer_topk_reducesum_interface +from indexer_bwd import indexer_bwd_interface +from sparse_mla_fwd import sparse_mla_fwd_interface +from sparse_mla_bwd import sparse_mla_bwd +from sparse_mla_topk_reducesum import sparse_mla_topk_reducesum_interface +from einops import einsum, repeat +from utils import get_abs_err, get_err_ratio + + +class RegsiterLossFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x, loss): + ctx.save_for_backward(loss) + return x + + @staticmethod + def backward(ctx, grad): + loss = ctx.saved_tensors + return grad, torch.ones(1, dtype=loss[0].dtype, device=loss[0].device) + + +register_loss = RegsiterLossFunction.apply + + +def ref_deepseek_sparse_attention_innner( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + dtype = q.dtype + q, kv, index_q, index_k, weights = map(lambda x: x.to(torch.float32), (q, kv, index_q, index_k, weights)) + + index_sm_scale = index_q.shape[-1] ** -0.5 + b, s = index_q.shape[:2] + + # tl_topk_indices = tl_topk_indices.to(torch.int64) + # tl_topk_indices[tl_topk_indices == -1] = s + + casual_mask = (torch.arange(s)[:, None] >= 
torch.arange(s)[None, :]).to(q.device) + index_logits = einsum(index_q, index_k, "b s1 h k, b s2 k -> b s1 h s2") + index_logits = F.relu(index_logits) + index_logits = (index_logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * index_sm_scale + index_logits = torch.where(casual_mask, index_logits, float("-inf")) + topk_indices = torch.topk(index_logits, k=topk, dim=-1).indices + topk_logits = torch.gather(F.pad(index_logits, (0, 1), value=float("-inf")), dim=-1, index=topk_indices) + topk_score = F.log_softmax(topk_logits, dim=-1, dtype=torch.float32) + index_topk_score = topk_score + + if sm_scale is None: + sm_scale = kv.shape[-1] ** -0.5 + + h = q.shape[-2] + index_mask = torch.zeros((b, s, s + 1), dtype=torch.bool, device="cuda").scatter_( + dim=-1, index=topk_indices, src=torch.ones_like(topk_indices, dtype=torch.bool) + )[:, :, :-1] + mask = repeat(casual_mask & index_mask, "b s1 s2 -> b s1 h s2", h=h) + k, v = kv, kv[..., :dim_v] + logits = einsum(q, k, "b s1 h d, b s2 d -> b s1 h s2") * sm_scale + logits = torch.where(mask, logits, float("-inf")) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + o = einsum(attn_score, v, "b s1 h s2, b s2 d -> b s1 h d") + + attn_score = attn_score.sum(dim=-2) # [b, s1, s2] + attn_topk_score = torch.gather(F.pad(attn_score, (0, 1)), dim=-1, index=topk_indices) + attn_topk_score = attn_topk_score / attn_topk_score.sum(dim=-1, keepdim=True) + + loss = F.kl_div(index_topk_score.clip(-100, 0), attn_topk_score.detach().log().clip(-100, 0), log_target=True, reduction="sum") + o = register_loss(o, loss) + + return o.to(dtype), topk_indices + + +def ref_deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + index_sm_scale: Optional[float] = None, +): + all_o, all_topk_indices = [], [] + for i in range(offsets.shape[0] - 1): + o, topk_indices = ref_deepseek_sparse_attention_innner( + q[None, offsets[i] : offsets[i + 1]], + kv[None, offsets[i] : offsets[i + 1]], + index_q[None, offsets[i] : offsets[i + 1]], + index_k[None, offsets[i] : offsets[i + 1]], + weights[None, offsets[i] : offsets[i + 1]], + topk, + dim_v, + sm_scale, + index_sm_scale, + ) + all_o.append(o.squeeze(0)) + all_topk_indices.append(topk_indices.squeeze(0)) + o = torch.cat(all_o, dim=0) + topk_indices = torch.cat(all_topk_indices, dim=0) + return o, topk_indices + + +class DSAFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, + ): + # topk_indices, index_score = ref_index_score(index_q, weights, index_k, topk) + topk_indices, index_score = indexer_topk_reducesum_interface(index_q, weights, index_k, topk, offsets) + o, lse = sparse_mla_fwd_interface(q, kv.unsqueeze(-2), topk_indices.unsqueeze(-2), offsets, sm_scale=sm_scale, d_v=dim_v) + ctx.save_for_backward(q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets) + ctx.topk = topk + ctx.dim_v = dim_v + ctx.sm_scale = sm_scale + return o, topk_indices + + @staticmethod + def backward( + ctx, + do: torch.Tensor, + _1: torch.Tensor, + ): + q, kv, index_q, index_k, weights, topk_indices, index_score, o, lse, offsets = ctx.saved_tensors + attn_score = sparse_mla_topk_reducesum_interface( + q, 
kv.unsqueeze(-2), topk_indices.unsqueeze(-2), lse, offsets, dim_v=ctx.dim_v + ).squeeze(-2) + dq, dkv = sparse_mla_bwd(q, kv.unsqueeze(-2), o, do, topk_indices.unsqueeze(-2), lse, offsets, sm_scale=ctx.sm_scale) + dindex_q, dweights, dindex_k = indexer_bwd_interface(index_q, weights, index_k, attn_score, index_score, topk_indices, offsets) + return dq, dkv.squeeze(-2), dindex_q, dindex_k, dweights, None, None, None, None + + +def deepseek_sparse_attention( + q: torch.Tensor, + kv: torch.Tensor, + index_q: torch.Tensor, + index_k: torch.Tensor, + weights: torch.Tensor, + offsets: torch.Tensor, + topk: int, + dim_v: int, + sm_scale: Optional[float] = None, +): + return DSAFunction.apply(q, kv, index_q, index_k, weights, offsets, topk, dim_v, sm_scale) + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + index_D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16().requires_grad_() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16().requires_grad_() + index_q = torch.randn((S, H, index_D)).cuda().bfloat16().requires_grad_() + weights = torch.randn((S, H)).cuda().bfloat16().requires_grad_() + index_k = torch.randn((S, index_D)).cuda().bfloat16().requires_grad_() + do = torch.randn((S, H, D)).cuda().bfloat16().requires_grad_() + offsets = torch.tensor([0, S // 2, S], dtype=torch.int32).cuda() + + o, topk_indices = deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + o.backward(do) + q_grad, q.grad = q.grad, None + kv_grad, kv.grad = kv.grad, None + index_q_grad, index_q.grad = index_q.grad, None + index_k_grad, index_k.grad = index_k.grad, None + weights_grad, weights.grad = weights.grad, None + + ref_o, ref_topk_indices = ref_deepseek_sparse_attention(q, kv, index_q, index_k, weights, offsets, topk, D) + ref_o.backward(do) + ref_q_grad, q.grad = q.grad, None + ref_kv_grad, kv.grad = kv.grad, None + ref_index_q_grad, index_q.grad = index_q.grad, None + ref_index_k_grad, index_k.grad = index_k.grad, None + ref_weights_grad, weights.grad = weights.grad, None + + print(f"o err: {get_abs_err(o, ref_o):.6f} ratio: {get_err_ratio(o, ref_o):.6f}") + print(f"q.grad err: {get_abs_err(q_grad, ref_q_grad):.6f} ratio: {get_err_ratio(q_grad, ref_q_grad):.6f}") + print(f"kv.grad err: {get_abs_err(kv_grad, ref_kv_grad):.6f} ratio: {get_err_ratio(kv_grad, ref_kv_grad):.6f}") + print( + f"index_q.grad err: {get_abs_err(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f} ratio: {get_err_ratio(index_q_grad[:, :64, :], ref_index_q_grad[:, :64, :]):.6f}" + ) + print(f"index_k.grad err: {get_abs_err(index_k_grad, ref_index_k_grad):.6f} ratio: {get_err_ratio(index_k_grad, ref_index_k_grad):.6f}") + print(f"weights.grad err: {get_abs_err(weights_grad, ref_weights_grad):.6f} ratio: {get_err_ratio(weights_grad, ref_weights_grad):.6f}") + + intersections = [] + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + mask = trt_np != -1 + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + intersections.append(len(intersection) / len(set_ref)) + print("average intersections: {:.4f}".format(sum(intersections) / len(intersections))) + + +test_kernel() diff --git a/examples/dsa_sparse_finetune/index.py b/examples/dsa_sparse_finetune/index.py new file mode 100644 index 000000000..5e4800411 --- /dev/null +++ b/examples/dsa_sparse_finetune/index.py @@ -0,0 +1,82 @@ +# Modified from: 
https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py +import torch +import torch.nn.functional as F +import functools +from typing import Callable, Any + + +def tensor_cache( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent result of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + If the function is called again with the same input tensors, it will return the cached result. + + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. + + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. + """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if ( + (last_args is not None and last_kwargs is not None) + and (len(args) == len(last_args) and len(kwargs) == len(last_kwargs)) + and all(a is b for a, b in zip(args, last_args, strict=False)) + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()) + ): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_cu_seqlens_from_lens( + lens: torch.LongTensor, + dtype: torch.dtype | None = torch.int32, +) -> torch.LongTensor: + return F.pad(lens.cumsum(dim=0, dtype=dtype), (1, 0)) + + +@tensor_cache +def prepare_lens_from_cu_seqlens( + cu_seqlens: torch.LongTensor, +) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_position_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.cat([torch.arange(n, dtype=cu_seqlens.dtype, device=cu_seqlens.device) for n in prepare_lens(cu_seqlens).unbind()]) + + +@tensor_cache +def prepare_sequence_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return prepare_position_ids(cu_seqlens).eq(0).cumsum(0) - 1 + + +@tensor_cache +def prepare_token_indices(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + position_ids = prepare_position_ids(cu_seqlens) + return torch.stack([prepare_sequence_ids(cu_seqlens), position_ids], 1).to(cu_seqlens) diff --git a/examples/dsa_sparse_finetune/indexer_bwd.py b/examples/dsa_sparse_finetune/indexer_bwd.py new file mode 100644 index 000000000..68508ad4e --- /dev/null +++ b/examples/dsa_sparse_finetune/indexer_bwd.py @@ -0,0 +1,254 @@ +import torch +import torch.nn.functional as F +from einops import einsum, repeat + +import tilelang as tl +import tilelang.language as T +from typing import Optional +from index import prepare_token_indices + +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tl.jit(pass_configs=pass_configs) +def tl_indexer_bwd_impl( + heads: int, + dim: int, + topk: int, + sm_scale: Optional[float] = None, + block_I: int = 32, + num_stages: int = 0, + num_threads: int = 128, +): + assert num_stages == 0 + assert topk == tl.math.next_power_of_2(topk) + assert topk % block_I == 0 + assert 
heads <= 64 and heads % 8 == 0 + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + dtype: str = BF16 + accum_dtype: str = FP32 + index_q_shape = [seq_len, heads, dim] + weights_shape = [seq_len, heads] + index_k_shape = [seq_len, dim] + shape_p = [seq_len, topk] + topk_indices_shape = [seq_len, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + if sm_scale is None: + sm_scale = dim**-0.5 + + @T.prim_func + def tl_indexer_bwd_kernel( + IndexQ: T.Tensor(index_q_shape, dtype), + Weights: T.Tensor(weights_shape, dtype), + IndexK: T.Tensor(index_k_shape, dtype), + dIndexQ: T.Tensor(index_q_shape, dtype), + dWeights: T.Tensor(weights_shape, dtype), + dIndexK: T.Tensor(index_k_shape, dtype), + AttnScore: T.Tensor(shape_p, FP32), + IndexScore: T.Tensor(shape_p, FP32), + TopkIndices: T.Tensor(topk_indices_shape, INT32), + Offsets: T.Tensor(offsets_shape, INT32), + TokenIndices: T.Tensor(token_indices_shape, INT32), + ): + with T.Kernel(seq_len, threads=num_threads) as (bx): + i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1] + bos = Offsets[i_b] + num_blocks = T.ceildiv(topk, block_I) + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + weights_shared = T.alloc_shared([heads], dtype=dtype) + + d_index_q_frag = T.alloc_fragment([heads, dim], dtype=accum_dtype) + d_weights_frag = T.alloc_fragment([heads], dtype=accum_dtype) + + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.copy(Weights[bos + i_t, :], weights_shared) + T.fill(d_index_q_frag, 0) + T.fill(d_weights_frag, 0) + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + + for bi_i in T.Pipelined(num_blocks, num_stages=num_stages): + i_st = bi_i * block_I + i_ed = (bi_i + 1) * block_I + + indices_shared = T.alloc_shared([block_I], dtype=INT32) + T.copy(TopkIndices[bos + i_t, i_st:i_ed], indices_shared) + + index_k_shared = T.alloc_shared([block_I, dim], dtype=dtype) + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + index_k_shared[i, j] = T.if_then_else((pos > -1) & (pos <= i_t), IndexK[bos + pos, j], 0) + + attn_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + index_score_shared = T.alloc_shared([block_I], dtype=accum_dtype) + for i in T.Parallel(block_I): + attn_score_shared[i] = AttnScore[bos + i_t, i_st + i] + index_score_shared[i] = IndexScore[bos + i_t, i_st + i] + + logits = T.alloc_fragment((block_I, heads), accum_dtype) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + for i, j in T.Parallel(block_I, heads): + logits[i, j] = T.max(logits[i, j], 0) + + # dw + d_weights_i = T.alloc_fragment((block_I, heads), accum_dtype) + for i, j in T.Parallel(block_I, heads): + d_weights_i[i, j] = (index_score_shared[i] - attn_score_shared[i]) * logits[i, j] + T.reduce_sum(d_weights_i, d_weights_frag, dim=0, clear=False) + + d_logits_qk = T.alloc_shared((block_I, heads), accum_dtype) + d_logits_qk_cast1 = T.alloc_fragment((block_I, heads), dtype) + d_logits_qk_cast2 = T.alloc_fragment((block_I, heads), dtype) + + for i, j in T.Parallel(block_I, heads): + d_relu = T.alloc_var(accum_dtype) + if logits[i, j] > 0: + d_relu = 1.0 + else: + d_relu = 0.0 + d_logits_qk[i, j] = (index_score_shared[i] - attn_score_shared[i]) * d_relu * weights_shared[j] + + # dq + T.copy(d_logits_qk, d_logits_qk_cast1) + T.gemm( + d_logits_qk_cast1, # [BS, HQ] + index_k_shared, # [BS, K] + d_index_q_frag, # [HQ, K] + transpose_A=True, + transpose_B=False, 
+ clear_accum=False, + ) + + # dk + T.copy(d_logits_qk, d_logits_qk_cast2) + d_index_k_frag = T.alloc_fragment([block_I, dim], dtype=accum_dtype) + T.gemm( + d_logits_qk_cast2, # [BS, HQ] + index_q_shared, # [HQ, K] + d_index_k_frag, # [BS, K] + transpose_A=False, + transpose_B=False, + clear_accum=True, + ) + + for i, j in T.Parallel(block_I, dim): + pos = indices_shared[i] + if (pos > -1) & (pos <= i_t): + T.atomic_add(dIndexK[bos + pos, j], d_index_k_frag[i, j]) + + for i, j in T.Parallel(heads, dim): + d_index_q_frag[i, j] = d_index_q_frag[i, j] * sm_scale + + T.copy(d_index_q_frag, dIndexQ[bos + i_t, :, :]) + T.copy(d_weights_frag, dWeights[bos + i_t, :]) + + return tl_indexer_bwd_kernel + + +def indexer_bwd_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + attn_score: torch.Tensor, + index_score: torch.Tensor, + topk_indices: torch.Tensor, + offsets: torch.Tensor, +): + _, heads, dim, topk = *q.shape, topk_indices.shape[-1] + token_indices = prepare_token_indices(offsets) + dq = torch.zeros_like(q) + dweights = torch.zeros_like(weights) + dk = torch.zeros_like(k) + kernel = tl_indexer_bwd_impl(heads, dim, topk) + kernel(q, weights, k, dq, dweights, dk, attn_score, index_score, topk_indices, offsets, token_indices) + return dq, dweights, dk + + +def ref_indexer_bwd( + Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, AttnScore: torch.Tensor, offsets: torch.Tensor +) -> torch.Tensor: + Q.requires_grad_(True) + Weights.requires_grad_(True) + K.requires_grad_(True) + softmax_scale = Q.shape[-1] ** -0.5 + all_loss = [] + all_log_topk_prob = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 1] - offsets[i]).item() >= TopkIndices.shape[-1] + q = Q[offsets[i] : offsets[i + 1]] + weights = Weights[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + topk_indices = TopkIndices[offsets[i] : offsets[i + 1]] + attn_score = AttnScore[offsets[i] : offsets[i + 1]] + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, "s1 h k, s2 k -> s1 h s2") * softmax_scale + logits = F.relu(logits) + score = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) + score = torch.where(mask, score, float("-inf")) + topk_value = torch.gather(score, dim=-1, index=topk_indices.to(torch.int64)) + log_topk_prob = F.log_softmax(topk_value, dim=-1, dtype=torch.float32) + loss = F.kl_div(log_topk_prob.clip(-100, 0), attn_score.log().clip(-100, 0), log_target=True, reduction="sum") + all_loss.append(loss) + all_log_topk_prob.append(log_topk_prob) + loss = torch.stack(all_loss).sum() + loss.backward() + log_topk_prob = torch.cat(all_log_topk_prob, dim=0) + return log_topk_prob.exp(), Q.grad, Weights.grad, K.grad + + +def test_kernel( + B=1, + S=2048, + H=16, + D=128, + topk=64, +): + torch.manual_seed(42) + q = torch.randn((S, H, D)).cuda().bfloat16() + w = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + all_attn_score = [] + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + mask = (torch.arange(seq_len)[:, None] >= torch.arange(topk)[None, :]).to(q.device) + logits = torch.ones(seq_len, topk).cuda() + logits = torch.where(mask, logits, float("-inf")) + attn_score = F.softmax(logits, dim=-1, dtype=torch.float32) + all_attn_score.append(attn_score) + attn_score = torch.cat(all_attn_score, dim=0) + + topk_indices = 
repeat(torch.arange(topk, dtype=torch.int32).cuda(), "k -> s k", s=S).contiguous()
+    index_score, ref_dq, ref_dw, ref_dk = ref_indexer_bwd(q, w, k, topk_indices, attn_score, offsets)
+
+    dq, dw, dk = indexer_bwd_interface(q, w, k, attn_score, index_score, topk_indices, offsets)
+
+    print(f"dq err: {get_abs_err(dq, ref_dq):.6f} ratio: {get_err_ratio(dq, ref_dq):.6f}")
+    print(f"dw err: {get_abs_err(dw, ref_dw):.6f} ratio: {get_err_ratio(dw, ref_dw):.6f}")
+    print(f"dk err: {get_abs_err(dk, ref_dk):.6f} ratio: {get_err_ratio(dk, ref_dk):.6f}")
+
+
+if __name__ == "__main__":
+    test_kernel()
diff --git a/examples/dsa_sparse_finetune/indexer_topk_reducesum.py b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py
new file mode 100644
index 000000000..d76eb0272
--- /dev/null
+++ b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py
@@ -0,0 +1,273 @@
+import math
+import torch
+import torch.nn.functional as F
+from einops import einsum
+
+import tilelang as tl
+import tilelang.language as T
+from typing import Optional
+from index import prepare_token_indices
+
+from utils import get_abs_err, get_err_ratio
+
+BF16 = T.bfloat16
+FP32 = T.float32
+INT32 = T.int32
+
+pass_configs = {
+    tl.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True,
+    tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+    tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+}
+
+
+@tl.jit(pass_configs=pass_configs)
+def tl_indexer_topk_reducesum_impl(
+    heads: int,
+    dim: int,
+    topk: int,
+    sm_scale: Optional[float] = None,
+    block_K: int = 32,
+    dtype: str = FP32,
+    num_stages: int = 0,
+    num_threads: int = 128,
+):
+    assert topk == tl.math.next_power_of_2(topk)
+    assert topk % block_K == 0
+    assert heads <= 64 and heads % 8 == 0
+    assert num_stages == 0
+    batch_plus_one = T.symbolic("batch_plus_one")
+    seq_len = T.symbolic("seq_len")
+
+    index_q_shape = [seq_len, heads, dim]
+    weights_shape = [seq_len, heads]
+    index_k_shape = [seq_len, dim]
+    topk_indices_shape = [seq_len, topk]
+    offsets_shape = [batch_plus_one]
+    token_indices_shape = [seq_len, 2]
+
+    N = 2 * topk
+    num_iters = int(round(math.log2(N)))
+    if sm_scale is None:
+        sm_scale = dim**-0.5
+
+    @T.macro
+    def bitonic_sort(
+        topk_index_shared: T.SharedBuffer([N], dtype=INT32),
+        topk_value_shared: T.SharedBuffer([N], dtype=FP32),
+    ):
+        T.sync_threads()
+        for i1 in T.serial(num_iters):
+            for i2 in T.serial(i1 + 1):
+                for i in T.Parallel(N):
+                    ascending = (i & (1 << (i1 + 1))) != 0
+                    j = i ^ (1 << (i1 - i2))
+                    if i < j and (
+                        (ascending and topk_value_shared[i] > topk_value_shared[j])
+                        or (not ascending and topk_value_shared[i] < topk_value_shared[j])
+                    ):
+                        val = topk_value_shared[i]
+                        topk_value_shared[i] = topk_value_shared[j]
+                        topk_value_shared[j] = val
+                        idx = topk_index_shared[i]
+                        topk_index_shared[i] = topk_index_shared[j]
+                        topk_index_shared[j] = idx
+                T.sync_threads()
+
+    @T.prim_func
+    def tl_indexer_topk_reducesum_kernel(
+        IndexQ: T.Tensor(index_q_shape, dtype),
+        Weights: T.Tensor(weights_shape, dtype),
+        IndexK: T.Tensor(index_k_shape, dtype),
+        TopkIndices: T.Tensor(topk_indices_shape, INT32),
+        ReduceSum: T.Tensor(topk_indices_shape, FP32),
+        Offsets: T.Tensor(offsets_shape, INT32),
+        TokenIndices: T.Tensor(token_indices_shape, INT32),
+    ):
+        with T.Kernel(seq_len, threads=num_threads) as (bx):
+            i_b, i_t = TokenIndices[bx, 0], TokenIndices[bx, 1]
+            bos, eos = Offsets[i_b], Offsets[i_b + 1]
+            num_blocks = T.ceildiv(i_t + 1, block_K)
+
+            topk_index_shared = T.alloc_shared([N], dtype=INT32)
+            topk_value_shared = T.alloc_shared([N], dtype=FP32)
+
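+            # Running top-k over the causal prefix: the first `topk` slots of these
+            # shared buffers hold the current best candidates, while the second `topk`
+            # slots act as a staging area that is refilled block by block. Whenever the
+            # staging half fills up, bitonic_sort (descending) merges the two halves so
+            # the largest `topk` scores always end up back in slots [0, topk).
+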
T.fill(topk_index_shared, -1) + T.fill(topk_value_shared, float("-inf")) + T.sync_threads() + + index_q_shared = T.alloc_shared([heads, dim], dtype=dtype) + T.copy(IndexQ[bos + i_t, :, :], index_q_shared) + T.sync_threads() + + weights_frag = T.alloc_shared([heads], dtype=dtype) + T.copy(Weights[bos + i_t, :], weights_frag) + T.sync_threads() + + for i, j in T.Parallel(heads, dim): + index_q_shared[i, j] = index_q_shared[i, j] * sm_scale + T.sync_threads() + + for bk_i in T.Pipelined(num_blocks, num_stages=num_stages): + k_st = bk_i * block_K + k_ed = T.min((bk_i + 1) * block_K, eos - bos) + + index_k_shared = T.alloc_shared([block_K, dim], dtype=dtype) + for i, j in T.Parallel(block_K, dim): + index_k_shared[i, j] = T.if_then_else(k_st + i < k_ed, IndexK[bos + k_st + i, j], 0) + T.sync_threads() + + logits = T.alloc_fragment((block_K, heads), FP32) + T.gemm( + index_k_shared, + index_q_shared, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + T.sync_threads() + + for i, j in T.Parallel(block_K, heads): + logits[i, j] = T.max(logits[i, j], 0) * weights_frag[j] + T.sync_threads() + + logits_sum = T.alloc_fragment(block_K, FP32) + T.reduce_sum(logits, logits_sum, dim=1) + T.sync_threads() + + offset = T.alloc_var(INT32) + if k_st >= topk: + offset = topk + (k_st % topk) + else: + offset = k_st + T.sync_threads() + for i in T.Parallel(block_K): + if k_st + i > i_t: + logits_sum[i] = float("-inf") + j = offset + i + topk_index_shared[j] = k_st + i + topk_value_shared[j] = logits_sum[i] + T.sync_threads() + + if k_ed > topk and k_ed % topk == 0: + bitonic_sort(topk_index_shared, topk_value_shared) + + bitonic_sort(topk_index_shared, topk_value_shared) + + logits_max_frag = T.alloc_fragment([1], dtype=FP32) + logits_frag = T.alloc_fragment([topk], dtype=FP32) + reducesum_shared = T.alloc_shared([topk], dtype=FP32) + + T.copy(topk_value_shared[:topk], logits_frag) + T.sync_threads() + + T.reduce_max(logits_frag, logits_max_frag, dim=-1) + T.sync_threads() + + for i in T.Parallel(topk): + logits_frag[i] = T.exp(logits_frag[i] - logits_max_frag[0]) + T.sync_threads() + + lse_frag = T.alloc_fragment([1], dtype=FP32) + T.reduce_sum(logits_frag, lse_frag) + T.sync_threads() + + for i in T.Parallel(topk): + reducesum_shared[i] = logits_frag[i] / lse_frag[0] + T.sync_threads() + + # for i in T.Parallel(topk): + # reducesum_shared[i] = logits_frag[i] + # T.sync_threads() + + for i in T.Parallel(topk): + if topk_index_shared[i] > i_t: + topk_index_shared[i] = -1 + T.sync_threads() + + T.copy(topk_index_shared[:topk], TopkIndices[bos + i_t, :]) + T.copy(reducesum_shared[:topk], ReduceSum[bos + i_t, :]) + + return tl_indexer_topk_reducesum_kernel + + +def indexer_topk_reducesum_interface( + q: torch.Tensor, + weights: torch.Tensor, + k: torch.Tensor, + topk: int, + offsets: torch.Tensor, + dtype: str = BF16, +): + seq_len, heads, dim = q.shape + kernel = tl_indexer_topk_reducesum_impl(heads=heads, dim=dim, topk=topk, dtype=dtype) + token_indices = prepare_token_indices(offsets) + topk_indices = torch.zeros((seq_len, topk), device=q.device, dtype=torch.int32) + topk_score = torch.zeros((seq_len, topk), device=q.device, dtype=torch.float32) + kernel(q, weights, k, topk_indices, topk_score, offsets, token_indices) + return topk_indices, topk_score + + +def ref_index_score(Q: torch.Tensor, Weights: torch.Tensor, K: torch.Tensor, topk: int, offsets: torch.Tensor) -> torch.Tensor: + all_topk_indices = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + assert (offsets[i + 
1] - offsets[i]).item() >= topk + q = Q[offsets[i] : offsets[i + 1]] + weights = Weights[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + softmax_scale = q.shape[-1] ** -0.5 + s = q.shape[0] + mask = (torch.arange(s)[:, None] >= torch.arange(s)[None, :]).to(q.device) + logits = einsum(q, k, "s1 h k, s2 k -> s1 h s2") + logits = F.relu(logits) + logits = (logits * weights.unsqueeze(-1)).sum(dim=-2, dtype=torch.float32) * softmax_scale + logits = torch.where(mask, logits, float("-inf")) + topk_logits, topk_indices = torch.topk(logits, k=topk, dim=-1) + topk_score = F.softmax(topk_logits, dim=-1, dtype=torch.float32) + all_topk_indices.append(topk_indices) + all_topk_score.append(topk_score) + topk_indices = torch.cat(all_topk_indices, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return topk_indices, topk_score + + +def test_kernel( + B=1, + S=2048, + H=64, + D=128, + topk=64, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D)).cuda().bfloat16() + weights = torch.randn((S, H)).cuda().bfloat16() + k = torch.randn((S, D)).cuda().bfloat16() + offsets = torch.tensor([0, S], dtype=torch.int32).cuda() + + ref_topk_indices, ref_topk_score = ref_index_score(q, weights, k, topk, offsets) + + topk_indices, topk_score = indexer_topk_reducesum_interface(q, weights, k, topk, offsets) + + for j in range(S): + ref_np = ref_topk_indices[j].cpu().to(torch.int32).numpy() + trt_np = topk_indices[j].cpu().to(torch.int32).numpy() + + ref_np_val = ref_topk_score[j] + trt_np_val = topk_score[j] + + mask = (ref_np_val > 0).cpu().numpy() + + set_ref = set(ref_np[mask]) + set_trt = set(trt_np[mask]) + intersection = set_ref & set_trt + + print("idx:", j, "selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) + + print(f"err: {get_abs_err(ref_np_val, trt_np_val):.6f} ratio: {get_err_ratio(ref_np_val, trt_np_val):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/sparse_mla_bwd.py b/examples/dsa_sparse_finetune/sparse_mla_bwd.py new file mode 100644 index 000000000..53e5f8bfe --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_bwd.py @@ -0,0 +1,347 @@ +# ruff: noqa +import tilelang +from tilelang import language as T +import torch +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit(out_idx=[-1]) +def preprocess( + H, + D, + block_ND=32, + num_stages=5, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + + S = T.symbolic("S") + + shape = [S, H, D] + + @T.prim_func + def preprocess_kernel( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([S, H], accum_dtype), + ): + with T.Kernel(H, T.ceildiv(S, block_ND)) as (bx, by): + o = T.alloc_fragment([block_ND, block_ND], accum_dtype) + do = T.alloc_fragment([block_ND, block_ND], accum_dtype) + delta = T.alloc_fragment([block_ND], accum_dtype) + acc = T.alloc_fragment([block_ND, block_ND], accum_dtype) + T.clear(acc) + for k in T.Pipelined(T.ceildiv(D, block_ND), num_stages=num_stages): + T.copy(O[by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], o) + T.copy(dO[by * block_ND : (by + 1) * block_ND, bx, k * block_ND : (k + 1) * block_ND], do) + for i, j in T.Parallel(block_ND, block_ND): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + T.copy(delta, Delta[by * block_ND : (by + 1) * block_ND, bx]) + + return preprocess_kernel + + +@tilelang.jit(out_idx=[-1]) +def 
postprocess( + D, + D_tail, + kv_group=1, + block_N=64, + threads=128, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + S_kv = T.symbolic("S_kv") + + dkv_shape = [S_kv, kv_group, D + D_tail] + + @T.prim_func + def postprocess_kernel( + dKV: T.Tensor(dkv_shape, accum_dtype), + dKV_out: T.Tensor(dkv_shape, dtype), + ): + with T.Kernel(T.ceildiv(S_kv, block_N), kv_group, threads=threads) as (bx, by): + T.copy( + dKV[bx * block_N : (bx + 1) * block_N, by, :], + dKV_out[bx * block_N : (bx + 1) * block_N, by, :], + ) + + return postprocess_kernel + + +@tilelang.jit( + out_idx=[-2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def bwd( + H, + D, + D_tail, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + block_size=32, + num_stages=0, + threads=128, + indices_dtype=T.int32, + dtype=T.bfloat16, + accum_dtype=T.float32, +): + assert is_causal == True, "non-casual is not supported now" + assert topk % block_size == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + assert indices_dtype == T.int32 + + if sm_scale is None: + sm_scale = (D + D_tail) ** (-0.5) + + B_plus_one = T.symbolic("B_plus_one") + S = T.symbolic("S") + + H_kv = H // kv_group + q_shape = [S, H, D + D_tail] + k_shape = [S, kv_group, D + D_tail] + o_shape = [S, H, D] + indices_shape = [S, kv_group, topk] + delta_shape = [S, H] + lse_shape = [S, H] + offsets_shape = [B_plus_one] + token_indices_shape = [S, 2] + assert indices_dtype == T.int32 + assert dtype == T.bfloat16 + assert accum_dtype == T.float32 + + H = H_kv + padded_H = max(tilelang.math.next_power_of_2(H_kv), 16) + BS = block_size + NS = tilelang.cdiv(topk, block_size) + + split_store = 2 + + @T.prim_func + def sparse_mla_bwd_kernel( + Q: T.Tensor(q_shape, dtype), + KV: T.Tensor(k_shape, dtype), + dO: T.Tensor(o_shape, dtype), + Indices: T.Tensor(indices_shape, indices_dtype), + Lse: T.Tensor(lse_shape, accum_dtype), + Delta: T.Tensor(delta_shape, accum_dtype), + Offsets: T.Tensor(offsets_shape, indices_dtype), + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), + dQ: T.Tensor(q_shape, dtype), + dKV: T.Tensor(k_shape, accum_dtype), + ): + with T.Kernel(S, kv_group, threads=threads) as (b_s_i, bz): + Q_shared = T.alloc_shared([padded_H, D], dtype) + Q_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + KV_shared = T.alloc_shared([BS, D], dtype) + KV_tail_shared = T.alloc_shared([BS, D_tail], dtype) + dO_shared = T.alloc_shared([padded_H, D], dtype) + mask = T.alloc_fragment([BS], "bool") + + P_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dP_shared_cast = T.alloc_shared([padded_H, BS], dtype) + dQ_shared = T.alloc_shared([padded_H, D], dtype) + dQ_tail_shared = T.alloc_shared([padded_H, D_tail], dtype) + + acc_p = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dp = T.alloc_fragment([padded_H, BS], accum_dtype) + acc_dq = T.alloc_fragment([padded_H, D], accum_dtype) + acc_dq_tail = T.alloc_fragment([padded_H, D_tail], accum_dtype) + acc_dkv = T.alloc_fragment([BS, D], accum_dtype) + acc_dkv_tail = T.alloc_fragment([BS, D_tail], accum_dtype) + acc_dkv_shared = T.view(KV_shared, shape=[BS // split_store, D], dtype=accum_dtype) + acc_dkv_tail_shared = T.view(KV_tail_shared, shape=[BS // split_store, D_tail], dtype=accum_dtype) + + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = 
Offsets[b_i], Offsets[b_i + 1] + + max_kv_i = s_i + + T.copy(Q[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D], Q_shared) + T.copy(Q[bos + s_i, bz * padded_H : (bz + 1) * padded_H, D:], Q_tail_shared) + T.copy(dO[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D], dO_shared) + + T.clear(acc_dq) + T.clear(acc_dq_tail) + + # Process each block of indices + for i_i in T.Pipelined(NS, num_stages=num_stages): + # Check which indices are valid + for bi_i in T.Parallel(BS): + mask[bi_i] = (Indices[bos + s_i, bz, i_i * BS + bi_i] <= max_kv_i) & (Indices[bos + s_i, bz, i_i * BS + bi_i] != -1) + + # Compute attention scores + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_p.dtype)) + + # Load KV, V for this block of indices + for bi_i, d_i in T.Parallel(BS, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, d_i] + + T.gemm(Q_shared, KV_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for bi_i, d_i in T.Parallel(BS, D_tail): + KV_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i], bz, D + d_i] + T.gemm(Q_tail_shared, KV_tail_shared, acc_p, transpose_B=True, policy=T.GemmWarpPolicy.FullCol) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_p[h_i, bi_i] = T.exp(acc_p[h_i, bi_i] * sm_scale - Lse[bos + s_i, bz * padded_H + h_i]) + + T.copy(acc_p, P_shared_cast) + + T.gemm(dO_shared, KV_shared, acc_dp, transpose_B=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + + for h_i, bi_i in T.Parallel(padded_H, BS): + acc_dp[h_i, bi_i] = acc_p[h_i, bi_i] * (acc_dp[h_i, bi_i] - Delta[bos + s_i, bz * padded_H + h_i]) * sm_scale + + T.copy(acc_dp, dP_shared_cast) + T.gemm(dP_shared_cast, KV_shared, acc_dq, policy=T.GemmWarpPolicy.FullCol) + T.gemm(dP_shared_cast, KV_tail_shared, acc_dq_tail, policy=T.GemmWarpPolicy.FullCol) + + T.gemm(dP_shared_cast, Q_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol, clear_accum=True) + T.gemm(P_shared_cast, dO_shared, acc_dkv, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) + + T.clear(acc_dkv_tail) + T.gemm(dP_shared_cast, Q_tail_shared, acc_dkv_tail, transpose_A=True, policy=T.GemmWarpPolicy.FullCol) + + for s in range(split_store): + for bi_i, d_i in T.Parallel(BS, D): + if bi_i < BS // split_store: + acc_dkv_shared[bi_i, d_i] = acc_dkv[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS, D_tail): + if bi_i < BS // split_store: + acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] + + for bi_i, d_i in T.Parallel(BS // split_store, D // 4): + T.atomic_addx4( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, d_i * 4], + acc_dkv_shared[bi_i, d_i * 4], + ) + + # Atomically update dKV, dKV_tail tensors + for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): + T.atomic_addx4( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, D + d_i * 4], + acc_dkv_tail_shared[bi_i, d_i * 4], + ) + + # Store the accumulated dQ + T.copy(acc_dq, dQ_shared) + T.copy(acc_dq_tail, dQ_tail_shared) + + T.copy(dQ_shared, dQ[bos + s_i, bz * padded_H : (bz + 1) * padded_H, :D]) + T.copy(dQ_tail_shared, dQ[bos + s_i, bz * padded_H : (bz + 1) * padded_H, D:]) + + return sparse_mla_bwd_kernel + + +def sparse_mla_bwd(q, kv, o, do, indices, lse, offsets, sm_scale=None, is_casual=True, return_kernel=False, delta=None): + assert q.is_contiguous() + assert kv.is_contiguous() + assert indices.is_contiguous() + assert 
lse.is_contiguous() + S, H, dim_plus_tail_dim = q.shape + S_kv, kv_group, _ = kv.shape + assert kv.shape[-1] == dim_plus_tail_dim + assert S == S_kv + # dim should be assigned + D = 512 + + D_tail = dim_plus_tail_dim - D + topk = indices.shape[-1] + assert indices.shape == (S, kv_group, topk) + assert lse.shape == (S, H) + + token_indices = prepare_token_indices(offsets) + + # Get kernels + preprocess_kernel = preprocess(H, D) + bwd_kernel = bwd(H, D, D_tail, topk, kv_group, sm_scale, is_casual) + postprocess_kernel = postprocess(D, D_tail, kv_group) + + if delta is None: + delta = preprocess_kernel(o, do) + dkv = torch.zeros_like(kv, dtype=torch.float32) + dq = bwd_kernel(q, kv, do, indices, lse, delta, offsets, token_indices, dkv) + dkv = postprocess_kernel(dkv) + + return dq, dkv + + +def ref_sparse_mla_bwd_interface(q, kv, o, do, indices, lse, offsets, sm_scale=None, is_casual=True): + from sparse_mla_fwd import ref_sparse_mla_fwd_interface + + q = q.detach().clone() + kv = kv.detach().clone() + q.requires_grad = True + kv.requires_grad = True + o = ref_sparse_mla_fwd_interface(q, kv, indices, offsets, sm_scale, is_casual) + o.backward(do) + return q.grad, kv.grad + + +def test_sparse_mla_bwd(B=1, S=2048, H=64, HKV=1, DQKV=576, DV=512, topk=512, dtype=torch.bfloat16, check_correctness=True): + # Prepare data + q = torch.randn((S, H, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQKV), dtype=dtype, device="cuda").requires_grad_(True) + do = torch.randn((S, H, DV), dtype=dtype, device="cuda") + offsets = torch.tensor([0, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, : len(i_i)] = i_i + + # Forward + from sparse_mla_fwd import sparse_mla_fwd_interface + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, offsets) + + tl_dq, tl_dkv = sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + ref_dq, ref_dkv = ref_sparse_mla_bwd_interface(q, kv, None, do, indices, None, offsets) + + if check_correctness: + assert_tensors_similar(tl_dq, ref_dq, eps=1e-4, name="dq") + assert_tensors_similar(tl_dkv, ref_dkv, eps=1e-4, name="dkv") + print("assert_tensors_similar passed") + + per_token_flop = 2 * sum( + [ + H * DV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DQKV * topk, + H * DV * topk, + ] + ) + from tilelang.profiler import do_bench + + def fn(): + return sparse_mla_bwd(q, kv, tl_out, do, indices, tl_lse, offsets) + + ms = do_bench(fn, rep=100, warmup=250) + print(f"Average time: {ms:.3f} ms") + print(f"bwd io bandwidth = ", (B * S * max(DQKV * 2, DQKV + DV) * topk * 2) / (ms * 1e-3) / 1e12) + print(f"bwd tflops = ", per_token_flop * S / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_bwd(B=1, S=2048, H=64, HKV=1, DQKV=576, DV=512, topk=512, dtype=torch.bfloat16, check_correctness=True) diff --git a/examples/dsa_sparse_finetune/sparse_mla_fwd.py b/examples/dsa_sparse_finetune/sparse_mla_fwd.py new file mode 100644 index 000000000..d87523695 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_fwd.py @@ -0,0 +1,310 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +from index import prepare_token_indices + +from utils import assert_tensors_similar + + +@tilelang.jit( + 
out_idx=[-2, -1], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, +) +def sparse_mla_fwd( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + is_causal=True, + CP0=True, + block_I=32, + num_stages=2, + threads=128, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 + else: + sm_scale = sm_scale + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + + head_kv = heads // kv_group + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len, kv_group, dim + tail_dim] + o_shape = [seq_len, heads, dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_o = T.alloc_fragment([H_per_block, D], accum_dtype) + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + S_shared = T.alloc_shared([H_per_block, BI], dtype) + sumexp = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i = T.alloc_fragment([H_per_block], accum_dtype) + alpha = T.alloc_fragment([H_per_block], accum_dtype) + m_i = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(acc_o, 0) + T.fill(sumexp, 0) + T.fill(m_i, -(2**30)) # avoid -inf - inf to cause nan + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + 
s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & (Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.copy(m_i, m_i_prev) + T.reduce_max(acc_s, m_i, dim=1, clear=False) + for h_i in T.Parallel(H_per_block): + alpha[h_i] = T.exp((m_i_prev[h_i] - m_i[h_i]) * sm_scale) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale) + T.reduce_sum(acc_s, sumexp_i, dim=1) # is this a accumulate operator? + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i] + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] = acc_o[h_i, d_i] * alpha[h_i] + + T.copy(acc_s, S_shared) + T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + # Rescale + for h_i, d_i in T.Parallel(H_per_block, D): + acc_o[h_i, d_i] /= sumexp[h_i] + for h_i in T.Parallel(H_per_block): + sumexp[h_i] = T.log(sumexp[h_i]) + m_i[h_i] * sm_scale + + T.copy(acc_o, Output[bos + s_i, H0:H1, :]) + T.copy(sumexp, Lse[bos + s_i, H0:H1]) + + return main + + +def sparse_mla_fwd_interface( + q, kv, indices, offsets, sm_scale=None, return_p_sum: bool = False, d_v=512, block_I=32, num_stages=2, threads=128 +): + is_casual = True + assert return_p_sum == False, "This kernel file is for fwd only" + assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() + seq_len, heads, dim_plus_tail_dim = q.shape + seq_len_kv, kv_group, _ = kv.shape + assert seq_len == seq_len_kv + + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" + dim = d_v + + assert kv.shape[-1] == dim_plus_tail_dim + tail_dim = dim_plus_tail_dim - dim + _, _, topk = indices.shape + assert indices.shape == (seq_len, kv_group, topk) + + token_indices = prepare_token_indices(offsets) + + kernel = sparse_mla_fwd( + heads, dim, tail_dim, topk, kv_group, sm_scale, is_casual, block_I=block_I, num_stages=num_stages, threads=threads + ) + out, lse = kernel(q, kv, indices, offsets, token_indices) + return out, lse + + +def ref_sparse_mla_fwd_interface(Q, KV, Indices, offsets, sm_scale=None, is_casual=True): + Q = Q.float() + KV = KV.float() + all_o = [] + for i in range(offsets.shape[0] - 1): + q = Q[None, offsets[i] : offsets[i + 1]] + kv = KV[None, offsets[i] : offsets[i + 1]] + indices = Indices[None, offsets[i] : offsets[i + 1]].clone() + + indices = indices.transpose(1, 2) + b, sq, h, dim_q = q.shape + b, sk, g, _ = kv.shape + + assert kv.shape[-1] == 576, "you should assign dim otherwise" + dim = 512 + k = kv + v = kv[..., :dim] + + b, _, _, dim_v = v.shape + g_index = g + h_index = h // g + compressed_casual_mask = torch.arange(0, sq, dtype=torch.int32, device="cuda").view(-1, 1) >= torch.arange( + 1 - 1, sk * 1, 1, dtype=torch.int32, device="cuda" + 
).view(1, -1) + + indices[indices > sk] = sk + mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) + mask = mask[..., :-1] + mask = mask & compressed_casual_mask.view(1, 1, sq, sk) + mask[:, :, : 1 - 1, 0] = True + mask = mask.view(b, g_index, 1, sq, sk) + + q = q.view(b, sq, g, -1, dim_q) + score = torch.einsum("bmghd,bngd->bghmn", q, k) + sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) + p = score.softmax(dim=-1) + p = p.view(b, g_index, h_index, -1, sq, sk) + p = p.view(b, g, -1, sq, sk) + o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v) + o = o.reshape(b, sq, h, dim_v) + all_o.append(o.squeeze(0)) + o = torch.cat(all_o, dim=0) + return o.to(torch.bfloat16) + + +def test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=2048, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, +): + torch.random.manual_seed(0) + q = torch.randn((S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) + kv = torch.randn((S, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) + offsets = torch.tensor([0, S // 2 - 1, S], dtype=torch.int32, device="cuda") + + indices = torch.full((S, HKV, topk), S, dtype=torch.int32, device="cuda") + for i in range(offsets.shape[0] - 1): + seq_len = (offsets[i + 1] - offsets[i]).item() + assert seq_len >= topk + for t in range(seq_len): + for h in range(HKV): + i_i = torch.randperm(max(1, t))[:topk] + indices[offsets[i] + t, h, : len(i_i)] = i_i + + tl_out, tl_lse = sparse_mla_fwd_interface(q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + if check_correctness: + # otherwise may cause out of memory + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, offsets) + assert_tensors_similar(tl_out, ref_out, eps=1e-2, name="out") + print("assert_tensors_similar passed") + + def fn(): + return sparse_mla_fwd_interface(q, kv, indices, offsets, block_I=block_I, num_stages=num_stages, threads=threads) + + from tilelang.profiler import do_bench + + ms = do_bench( + fn, + rep=100, + warmup=250, + ) + print(f"Average time: {ms:.3f} ms") + print("fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + print("fwd tflops = ", (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12) + + +if __name__ == "__main__": + test_sparse_mla_fwd( + B=1, + S=4096, + H=128, + HKV=1, + DQK=576, + DV=512, + topk=1024, + dtype=torch.bfloat16, + check_correctness=True, + block_I=64, + num_stages=2, + threads=256, + ) diff --git a/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py new file mode 100644 index 000000000..a03bc74f5 --- /dev/null +++ b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py @@ -0,0 +1,226 @@ +# ruff: noqa +import torch +import torch.nn as nn +import torch.nn.functional as F +import tilelang +from tilelang import language as T +from einops import repeat, rearrange, einsum +from index import prepare_token_indices +from utils import get_abs_err, get_err_ratio + +BF16 = T.bfloat16 +FP32 = T.float32 +INT32 = T.int32 + +pass_configs = { + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, +} + + +@tilelang.jit(pass_configs=pass_configs) +def tl_sparse_mla_topk_reducesum_impl( + heads, + dim, + tail_dim, + topk, + kv_group=1, + sm_scale=None, + block_I=32, + num_stages=2, + threads=128, +): + assert dim 
== tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 + + batch_plus_one = T.symbolic("batch_plus_one") + seq_len = T.symbolic("seq_len") + seq_len_kv = T.symbolic("seq_len_kv") + + head_kv = heads // kv_group + indices_dtype = T.int32 + dtype = T.bfloat16 + accum_dtype = T.float32 + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you should handle Q copy and Output copy with your mask (when kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + D = dim + D_tail = tail_dim + + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + q_shape = [seq_len, heads, dim + tail_dim] + kv_shape = [seq_len_kv, kv_group, dim + tail_dim] + indices_shape = [seq_len, kv_group, topk] + lse_shape = [seq_len, heads] + reducesum_shape = [seq_len, kv_group, REPLICATE_H, topk] + offsets_shape = [batch_plus_one] + token_indices_shape = [seq_len, 2] + + @T.prim_func + def tl_sparse_mla_topk_reducesum_kernel( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + Offsets: T.Tensor(offsets_shape, indices_dtype), # type: ignore + TokenIndices: T.Tensor(token_indices_shape, indices_dtype), # type: ignore + ReduceSum: T.Tensor(reducesum_shape, accum_dtype), # type: ignore + ): + with T.Kernel(seq_len * REPLICATE_H, kv_group, threads=threads) as ( + bx, + by, + ): + Q_shared = T.alloc_shared([H_per_block, D], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + KV_shared = T.alloc_shared([BI, D], dtype) + K_tail_shared = T.alloc_shared([BI, D_tail], dtype) + mask = T.alloc_fragment([BI], "bool") + + acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype) + reducesum = T.alloc_fragment([BI], accum_dtype) + lse = T.alloc_fragment([H_per_block], accum_dtype) + + T.fill(lse, 0) + + b_s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H) + b_i, s_i = TokenIndices[b_s_i, 0], TokenIndices[b_s_i, 1] + bos, eos = Offsets[b_i], Offsets[b_i + 1] + r_i = bx % REPLICATE_H + g_i = by + q_i = s_i + max_kv_i = q_i + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + T.copy(Q[bos + s_i, H0:H1, :D], Q_shared) + T.copy(Q[bos + s_i, H0:H1, D:], Q_tail_shared) + T.copy(Lse[bos + s_i, H0:H1], lse) + + for i_i in T.Pipelined(NI, num_stages=num_stages): + for bi_i in T.Parallel(BI): + mask[bi_i] = (Indices[bos + s_i, g_i, i_i * BI + bi_i] <= max_kv_i) & (Indices[bos + s_i, g_i, i_i * BI + bi_i] != -1) + + for bi_i, d_i in T.Parallel(BI, D): + KV_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, d_i] + for bi_i, d_i in T.Parallel(BI, D_tail): + K_tail_shared[bi_i, d_i] = KV[bos + Indices[bos + s_i, g_i, i_i * BI + bi_i], g_i, D + d_i] + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, 
bi_i] = T.if_then_else(mask[bi_i], 0, -T.infinity(acc_s.dtype)) + T.gemm( + Q_shared, + KV_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + T.gemm( + Q_tail_shared, + K_tail_shared, + acc_s, + transpose_B=True, + policy=T.GemmWarpPolicy.FullRow, + ) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s[h_i, bi_i] = T.exp(acc_s[h_i, bi_i] * sm_scale - lse[h_i]) + T.reduce_sum(acc_s, reducesum, dim=0) + T.copy(reducesum, ReduceSum[bos + s_i, g_i, r_i, i_i * BI : i_i * BI + BI]) + + return tl_sparse_mla_topk_reducesum_kernel + + +def sparse_mla_topk_reducesum_interface( + q: torch.Tensor, + kv: torch.Tensor, + topk_indices: torch.Tensor, + lse: torch.Tensor, + offsets: torch.Tensor, + dim_v: int, +): + assert kv.shape[-2] == 1 + seq_len, heads, dim_plus_tail_dim, topk = *q.shape, topk_indices.shape[-1] + REPLICATE_H = max(heads // 64, 1) + tail_dim = dim_plus_tail_dim - dim_v + token_indices = prepare_token_indices(offsets) + + reducesum = torch.zeros([seq_len, 1, REPLICATE_H, topk], dtype=torch.float32, device=q.device) + kernel = tl_sparse_mla_topk_reducesum_impl(heads=heads, dim=dim_v, tail_dim=tail_dim, topk=topk) + kernel(q, kv, topk_indices, lse, offsets, token_indices, reducesum) + reducesum = reducesum.sum(dim=-2) # [batch, seq_len, 1, RH, topk] -> [batch, seq_len, 1, topk] + attn_score = reducesum / reducesum.sum(dim=-1, keepdim=True) + + return attn_score + + +def ref_mla_topk_softmax(Q: torch.Tensor, K: torch.Tensor, TopkIndices: torch.Tensor, offsets: torch.Tensor): + # q: [batch, seq_len, heads, dim] + # k: [batch, seq_len, dim] + sm_scale = Q.shape[-1] ** -0.5 + all_lse = [] + all_topk_score = [] + for i in range(offsets.shape[0] - 1): + q = Q[offsets[i] : offsets[i + 1]] + k = K[offsets[i] : offsets[i + 1]] + topk_indices = TopkIndices[offsets[i] : offsets[i + 1]] + seq_len = q.shape[0] + mask = (torch.arange(seq_len)[:, None] >= torch.arange(seq_len)[None, :]).unsqueeze(-2).cuda() + logits = einsum(q, k, "s1 h d, s2 d -> s1 h s2") * sm_scale + logits = torch.where(mask, logits, float("-inf")) + score = F.softmax(logits, dim=-1, dtype=torch.float32) + score_sum = score.sum(dim=-2) + topk_score = torch.gather(score_sum, dim=-1, index=topk_indices.to(torch.int64)) + topk_score = topk_score / topk_score.sum(dim=-1, keepdim=True) + max_logits = logits.amax(dim=-1).to(torch.float32) + lse = torch.log((logits - max_logits.unsqueeze(-1).to(torch.float32)).exp().sum(dim=-1)) + max_logits + all_lse.append(lse) + all_topk_score.append(topk_score) + lse = torch.cat(all_lse, dim=0) + topk_score = torch.cat(all_topk_score, dim=0) + return lse, topk_score + + +def test_kernel( + B=1, + S=2048, + H=16, + D=512, + tail_D=64, + topk=128, +): + torch.manual_seed(42) + + q = torch.randn((S, H, D + tail_D)).cuda().bfloat16() + kv = torch.randn((S, D + tail_D)).cuda().bfloat16() + offsets = torch.tensor([0, 1023, S], dtype=torch.int32).cuda() + + topk_indices = repeat(torch.arange(topk, dtype=torch.int32).cuda(), "k -> s k", s=S).contiguous() + + lse, ref_attn_score = ref_mla_topk_softmax(q, kv, topk_indices, offsets) + + kv = kv.unsqueeze(-2) + topk_indices = topk_indices.unsqueeze(-2) + + attn_score = sparse_mla_topk_reducesum_interface(q, kv, topk_indices, lse, offsets, dim_v=D).squeeze(-2) + print(f"attn_score err: {get_abs_err(attn_score, ref_attn_score):.6f} ratio: {get_err_ratio(attn_score, ref_attn_score):.6f}") + + +if __name__ == "__main__": + test_kernel() diff --git a/examples/dsa_sparse_finetune/utils.py b/examples/dsa_sparse_finetune/utils.py new file 
mode 100644
index 000000000..96afd064d
--- /dev/null
+++ b/examples/dsa_sparse_finetune/utils.py
@@ -0,0 +1,73 @@
+import torch
+
+
+def get_abs_err(y, x):
+    x = x.to(torch.float32)
+    y = y.to(torch.float32)
+    return (x - y).flatten().abs().max().item()
+
+
+def get_err_ratio(y, x):
+    x = x.to(torch.float32)
+    y = y.to(torch.float32)
+    err = (x - y).flatten().square().mean().sqrt().item()
+    base = (x).flatten().square().mean().sqrt().item()
+    return err / base
+
+
+def calculate_tensor_similarity(x, y, name="tensor"):
+    """
+    Calculate similarity between two tensors using a normalized dot product metric.
+
+    Unlike torch.testing.assert_close which uses absolute/relative tolerance based on
+    element-wise differences, this function computes a global similarity score:
+        sim = 2 * <x, y> / (||x||^2 + ||y||^2)
+
+    This metric is scale-invariant and measures the cosine-like similarity normalized
+    by the magnitude of both tensors. It returns 1 for identical tensors and values
+    closer to 0 for dissimilar ones. This is particularly useful for comparing tensors
+    with varying magnitudes where relative errors matter more than absolute differences.
+
+    Args:
+        x: First tensor to compare
+        y: Second tensor to compare
+        name: Name of the tensor for logging purposes
+
+    Returns:
+        Similarity score in range [-1, 1] where 1 means identical
+    """
+    x, y = x.data.double(), y.data.double()
+    denominator = (x * x + y * y).sum()
+    if denominator == 0:
+        print(f"\033[33mWARNING: {name} all zero\033[0m")
+        return 1
+    sim = 2 * (x * y).sum() / denominator
+    return sim
+
+
+def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True):
+    """
+    Assert that two tensors are similar using a global similarity metric.
+
+    Key differences from torch.testing.assert_close:
+    - torch.testing.assert_close: Uses element-wise comparison with rtol/atol, checking
+      that |x - y| <= atol + rtol * |y| for each element. It's sensitive to outliers
+      and requires all elements to satisfy the tolerance.
+    - assert_tensors_similar: Uses a single global similarity score (1 - sim) where sim is the
+      normalized dot product. It's more robust to outliers and focuses on overall
+      tensor similarity rather than element-wise precision. This is better suited for
+      comparing large tensors where a few outlier elements shouldn't fail the test.
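+
+    A quick worked example: if y equals x then sim = 1 and diff = 0, while if y
+    equals -x then sim = -1 and diff = 2, so the check fails for any small eps.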
+ + Args: + x: First tensor to compare + y: Second tensor to compare + eps: Maximum allowed difference (1 - similarity), default 1e-8 + name: Name of the tensor for error messages + raise_assert: Whether to raise assertion error on failure + """ + sim = calculate_tensor_similarity(x, y, name) + diff = 1.0 - sim + if not (0 <= diff <= eps): + print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m") + if raise_assert: + assert False # noqa: B011 diff --git a/examples/dynamic_shape/example_dynamic.py b/examples/dynamic_shape/example_dynamic.py index be018c8b7..e338d76ca 100644 --- a/examples/dynamic_shape/example_dynamic.py +++ b/examples/dynamic_shape/example_dynamic.py @@ -1,10 +1,9 @@ import tilelang import tilelang.language as T import tilelang.testing -from tilelang import tvm as tvm -@tilelang.jit(pass_configs={"tl.disable_dynamic_tail_split": True, "tl.dynamic_alignment": 8}) +@tilelang.jit def matmul_dynamic_mnk( block_M, block_N, @@ -17,9 +16,9 @@ def matmul_dynamic_mnk( num_stages, threads, ): - M = tvm.te.var("m") - N = tvm.te.var("n") - K = tvm.te.var("k") + M = T.dynamic("m") + N = T.dynamic("n") + K = T.dynamic("k") A_shape = (K, M) if trans_A else (M, K) B_shape = (N, K) if trans_B else (K, N) @@ -29,9 +28,9 @@ def matmul_dynamic_mnk( @T.prim_func def dynamic_matmul( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -53,15 +52,14 @@ def dynamic_matmul( return dynamic_matmul -def matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads): +def matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads): print( f"M: {M}, N: {N}, K: {K}, block_M: {block_M}, block_N: {block_N}, block_K: {block_K}, trans_A: {trans_A}, trans_B: {trans_B}, in_dtype: {in_dtype}, out_dtype: {out_dtype}, accum_dtype: {accum_dtype}, num_stages: {num_stages}, threads: {threads}" ) - kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) + kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) import torch + if trans_A: A = torch.rand(K, M, device="cuda", dtype=getattr(torch, in_dtype)) else: @@ -103,8 +101,30 @@ def main(M=16384, N=16384, K=16384): accum_dtype = "float32" num_stages = 3 threads = 128 - matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) + matmul_dynamic(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) + + +def run_regression_perf(M=4096, N=4096, K=4096): + block_M, block_N, block_K = 128, 128, 32 + trans_A, trans_B = False, False + in_dtype, out_dtype = "float16", "float16" + accum_dtype = "float32" + num_stages = 3 + threads = 128 + kernel = matmul_dynamic_mnk(block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) + import torch + + if trans_A: + A = torch.rand(K, M, device="cuda", dtype=getattr(torch, in_dtype)) + else: + A = torch.rand(M, K, device="cuda", dtype=getattr(torch, in_dtype)) + if 
trans_B: + B = torch.rand(N, K, device="cuda", dtype=getattr(torch, in_dtype)) + else: + B = torch.rand(K, N, device="cuda", dtype=getattr(torch, in_dtype)) + C = torch.zeros(M, N, device="cuda", dtype=getattr(torch, out_dtype)) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(input_tensors=[A, B, C], backend="cupti") if __name__ == "__main__": diff --git a/examples/dynamic_shape/regression_example_dynamic.py b/examples/dynamic_shape/regression_example_dynamic.py new file mode 100644 index 000000000..958695990 --- /dev/null +++ b/examples/dynamic_shape/regression_example_dynamic.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_dynamic + + +def regression_example_dynamic(): + tilelang.testing.process_func(example_dynamic.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/elementwise/example_elementwise_add.py b/examples/elementwise/example_elementwise_add.py index bc9bb4df5..32da94015 100644 --- a/examples/elementwise/example_elementwise_add.py +++ b/examples/elementwise/example_elementwise_add.py @@ -3,19 +3,25 @@ import torch import tilelang import tilelang.language as T -from tilelang.autotuner import AutoTuner def ref_program(x, y): return x + y +def get_configs(): + block_M = [64, 128, 256] + block_N = [64, 128, 256] + threads = [64, 128, 256] + configs = list(itertools.product(block_M, block_N, threads)) + return [{"block_M": bm, "block_N": bn, "threads": th} for bm, bn, th in configs] + + +@tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[-1]) def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): - @T.prim_func - def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor( - (M, N), out_dtype)): + def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor((M, N), out_dtype)): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_N), in_dtype) B_shared = T.alloc_shared((block_M, block_N), in_dtype) @@ -24,7 +30,7 @@ def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T. T.copy(A[by * block_M, bx * block_N], A_shared) T.copy(B[by * block_M, bx * block_N], B_shared) - for (local_y, local_x) in T.Parallel(block_M, block_N): + for local_y, local_x in T.Parallel(block_M, block_N): C_local[local_y, local_x] = A_shared[local_y, local_x] + B_shared[local_y, local_x] T.copy(C_local, C_shared) T.copy(C_shared, C[by * block_M, bx * block_N]) @@ -32,53 +38,40 @@ def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T. 
return elem_add -def get_configs(M, N): - block_M = [64, 128, 256] - block_N = [64, 128, 256] - threads = [64, 128, 256] - configs = list(itertools.product(block_M, block_N, threads)) - return [{"block_M": bm, "block_N": bn, "threads": th} for bm, bn, th in configs] - - -def get_best_config(M, N): +def main(M=1024, N=1024, use_autotune=False): + a = torch.randn(M, N, dtype=torch.float32, device="cuda") + b = torch.randn(M, N, dtype=torch.float32, device="cuda") - def kernel(block_M=None, block_N=None, threads=None): - return elementwise_add(M, N, block_M, block_N, "float32", "float32", threads) + if use_autotune: + kernel = elementwise_add(M, N, in_dtype=T.float32, out_dtype=T.float32) + else: + # Default config + config = {"block_M": 32, "block_N": 32, "threads": 128} + kernel = elementwise_add(M, N, **config, in_dtype=T.float32, out_dtype=T.float32) - autotuner = AutoTuner.from_kernel( - kernel=kernel, configs=get_configs(M, N)).set_compile_args( - out_idx=[-1], - target="cuda", - ).set_profile_args( - supply_type=tilelang.TensorSupplyType.Auto, - ref_prog=ref_program, - skip_check=False, - ) - return autotuner.run(warmup=3, rep=20) + out = kernel(a, b) + torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2) -def main(): +def run_regression_perf(): parser = argparse.ArgumentParser() - parser.add_argument("--m", type=int, default=1024) - parser.add_argument("--n", type=int, default=1024) - parser.add_argument("--use_autotune", action="store_true", default=False) + parser.add_argument("--m", type=int, default=4096) + parser.add_argument("--n", type=int, default=4096) args, _ = parser.parse_known_args() M, N = args.m, args.n - a = torch.randn(M, N, dtype=torch.float32, device="cuda") b = torch.randn(M, N, dtype=torch.float32, device="cuda") + config = {"block_M": 32, "block_N": 32, "threads": 128} + kernel = elementwise_add(M, N, **config, in_dtype="float32", out_dtype="float32") + from tilelang.profiler import do_bench - if args.use_autotune: - result = get_best_config(M, N) - kernel = result.kernel - else: - # Default config - config = {"block_M": 32, "block_N": 32, "threads": 128} - kernel = elementwise_add(M, N, **config, in_dtype="float32", out_dtype="float32") - - out = kernel(a, b) - torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2) + return do_bench(lambda: kernel(a, b), backend="cupti") if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument("--m", type=int, default=1024) + parser.add_argument("--n", type=int, default=1024) + parser.add_argument("--use_autotune", action="store_true", default=False) + args, _ = parser.parse_known_args() + main(args.m, args.n, args.use_autotune) diff --git a/examples/elementwise/example_elementwise_add_tma_1d.py b/examples/elementwise/example_elementwise_add_tma_1d.py index 0467eba88..501e1f00d 100644 --- a/examples/elementwise/example_elementwise_add_tma_1d.py +++ b/examples/elementwise/example_elementwise_add_tma_1d.py @@ -10,10 +10,8 @@ def ref_program(x, y): @tilelang.jit(out_idx=[-1]) def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): - @T.prim_func - def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor( - (M, N), out_dtype)): + def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T.Tensor((M, N), out_dtype)): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_N), in_dtype) B_shared = 
T.alloc_shared((block_M, block_N), in_dtype) @@ -22,7 +20,7 @@ def elem_add(A: T.Tensor((M, N), in_dtype), B: T.Tensor((M, N), in_dtype), C: T. T.copy(A[by * block_M, bx * block_N], A_shared) T.copy(B[by * block_M, bx * block_N], B_shared) - for (local_y, local_x) in T.Parallel(block_M, block_N): + for local_y, local_x in T.Parallel(block_M, block_N): C_local[local_y, local_x] = A_shared[local_y, local_x] + B_shared[local_y, local_x] T.copy(C_local, C_shared) T.copy(C_shared, C[by * block_M, bx * block_N]) diff --git a/examples/elementwise/regression_example_elementwise.py b/examples/elementwise/regression_example_elementwise.py new file mode 100644 index 000000000..261202a56 --- /dev/null +++ b/examples/elementwise/regression_example_elementwise.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_elementwise_add + + +def regression_example_elementwise_add(): + tilelang.testing.process_func(example_elementwise_add.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/elementwise/test_example_elementwise.py b/examples/elementwise/test_example_elementwise.py index ff0b45a0a..24f675cd6 100644 --- a/examples/elementwise/test_example_elementwise.py +++ b/examples/elementwise/test_example_elementwise.py @@ -1,14 +1,13 @@ import tilelang.testing import example_elementwise_add -import example_elementwise_add_tma_1d def test_example_elementwise_add(): example_elementwise_add.main() -def test_example_elementwise_add_tma_1d(): - example_elementwise_add_tma_1d.main() +def test_example_elementwise_add_autotune(): + example_elementwise_add.main(use_autotune=True) if __name__ == "__main__": diff --git a/examples/flash_attention/README.md b/examples/flash_attention/README.md index be11a8dc6..355ed7325 100644 --- a/examples/flash_attention/README.md +++ b/examples/flash_attention/README.md @@ -34,8 +34,6 @@ def flash_attention( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - # Annotate layout for Q_shared, e.g., use a swizzled layout to optimize memory access - T.annotate_layout({Q_shared: tl.layout.make_swizzled_layout(Q_shared)}) # Copy a block of Q from global memory to Q_shared T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) @@ -77,6 +75,8 @@ def flash_attention( # Compute the maximum value per row on dimension 1 (block_N) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # Compute the factor by which we need to rescale previous partial sums for i in T.Parallel(block_M): @@ -106,4 +106,4 @@ def flash_attention( # Write back the final output block from acc_o to the Output buffer T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) -``` \ No newline at end of file +``` diff --git a/examples/flash_attention/bert_padding.py b/examples/flash_attention/bert_padding.py index 7058fd773..15c4097ce 100644 --- a/examples/flash_attention/bert_padding.py +++ b/examples/flash_attention/bert_padding.py @@ -6,7 +6,6 @@ class IndexFirstAxis(torch.autograd.Function): - @staticmethod def forward(ctx, input, indices): ctx.save_for_backward(indices) @@ -15,9 +14,7 @@ def forward(ctx, input, indices): second_dim = other_shape.numel() # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. # return input[indices] - return torch.gather( - rearrange(input, "b ... 
-> b (...)"), 0, - repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) + return torch.gather(rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)).reshape(-1, *other_shape) @staticmethod def backward(ctx, grad_output): @@ -40,14 +37,12 @@ def backward(ctx, grad_output): class IndexPutFirstAxis(torch.autograd.Function): - @staticmethod def forward(ctx, values, indices, first_axis_dim): ctx.save_for_backward(indices) assert indices.ndim == 1 assert values.ndim >= 2 - output = torch.zeros( - first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) + output = torch.zeros(first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype) # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. output[indices] = values # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values) @@ -66,7 +61,6 @@ def backward(ctx, grad_output): class IndexFirstAxisResidual(torch.autograd.Function): - @staticmethod def forward(ctx, input, indices): ctx.save_for_backward(indices) @@ -128,7 +122,7 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng """ Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model). The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286). - + For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: ``` [ @@ -177,9 +171,7 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng """ length = attention_mask_in_length.sum(dim=-1) seqlen = attention_mask_in_length.size(-1) - attention_mask_2d = torch.arange( - seqlen, device=length.device, dtype=length.dtype).expand(len(length), - seqlen) < length.unsqueeze(1) + attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1) real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten() seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx] indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten() diff --git a/examples/flash_attention/example_gqa_bwd.py b/examples/flash_attention/example_gqa_bwd.py index 907a121d2..801927faf 100644 --- a/examples/flash_attention/example_gqa_bwd.py +++ b/examples/flash_attention/example_gqa_bwd.py @@ -6,25 +6,27 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: 
T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -39,26 +41,25 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -72,29 +73,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) 
@@ -103,81 +106,74 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim_qk): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_qk] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: 
T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -197,35 +193,35 @@ def flash_bwd( dk_shared = T.alloc_shared([block_M, dim_qk], accum_dtype) dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -237,49 +233,41 @@ def flash_bwd( for i, j in T.Parallel(block_N, dim_qk): T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) T.copy(dk, dk_shared) - T.atomic_add(dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + 
scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] dk_shape = [groups, batch, seq_len, head_kv, dim_qk] # sum after kernel dv_shape = [groups, batch, seq_len, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -299,37 +287,35 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], dtype) dk_shared = T.alloc_shared([block_M, dim_qk], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + 
T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -342,16 +328,15 @@ def flash_bwd( T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.copy(dv_shared, dV[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy(dk, dK[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dk, dK[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -369,7 +354,10 @@ def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -386,17 +374,8 @@ def maybe_contiguous(x): if ctx.use_atomic: kernel = flashattn_bwd_atomic_add( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -409,17 +388,8 @@ def maybe_contiguous(x): dv = dv.to(torch.float16) else: kernel = flashattn_bwd_split( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel @@ -441,53 +411,45 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: 
int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -504,7 +466,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -522,19 +484,61 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 32 + N_CTX = 256 + D_HEAD_QK = 192 + D_HEAD_V = 128 + groups = 16 + causal = False + device = "cuda" + torch.manual_seed(42) + head_kv = H // groups + Q = torch.randn(BATCH, N_CTX, H, D_HEAD_QK, device=device, dtype=torch.half) + K = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.half) + V = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.half) + O = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half) + dO = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) + kernel = flashattn_bwd_split( + BATCH, + H, + N_CTX, + D_HEAD_QK, + D_HEAD_V, + causal, + block_M=128, + block_N=32, + threads=256, + num_stages=2, + groups=groups, + ) + dQ = torch.zeros_like(Q, dtype=torch.float32) + dK = torch.zeros(groups, BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.float16) + dV = torch.zeros(groups, BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - 
parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Handle backward compatibility and logic @@ -546,5 +550,4 @@ def run1(): # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py index b0732eb5a..fea547b6e 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py @@ -5,27 +5,31 @@ from tilelang.contrib import nvcc import argparse +tilelang.disable_cache() + @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -40,26 +44,27 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = 
T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops + # We should set it to negative large number instead + T.fill(scores_max, T.Cast(accum_dtype, -1e30)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, T.Cast(accum_dtype, -1e30)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -73,29 +78,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -104,12 +111,12 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by 
* blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep @@ -120,12 +127,14 @@ def make_dq_layout(dQ): @tilelang.jit( - out_idx=[3, 4, 5], pass_configs={ + out_idx=[3, 4, 5], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, head_kv, seq_len, dim_qk, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] @@ -133,64 +142,55 @@ def flashattn_bwd_postprocess(batch, heads, head_kv, seq_len, dim_qk, dim_v): @T.prim_func def flash_bwd_post( - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(q_shape, dtype), # type: ignore - dK_out: T.Tensor(k_shape, dtype), # type: ignore - dV_out: T.Tensor(v_shape, dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(q_shape, dtype), # type: ignore + dK_out: T.Tensor(k_shape, dtype), # type: ignore + dV_out: T.Tensor(v_shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) - T.copy(dQ[bz, bx * blk:(bx + 1) * blk, by, :], dQ_out[bz, bx * blk:(bx + 1) * blk, - by, :]) + T.copy(dQ[bz, bx * blk : (bx + 1) * blk, by, :], dQ_out[bz, bx * blk : (bx + 1) * blk, by, :]) with T.Kernel(T.ceildiv(seq_len, blk), head_kv, batch, threads=128) as (bx, by, bz): - T.annotate_layout({ - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - }) - T.copy(dK[bz, bx * blk:(bx + 1) * blk, by, :], dK_out[bz, bx * blk:(bx + 1) * blk, - by, :]) - T.copy(dV[bz, bx * blk:(bx + 1) * blk, by, :], dV_out[bz, bx * blk:(bx + 1) * blk, - by, :]) + T.annotate_layout( + { + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) + T.copy(dK[bz, bx * blk : (bx + 1) * blk, by, :], dK_out[bz, bx * blk : (bx + 1) * blk, by, :]) + T.copy(dV[bz, bx * blk : (bx + 1) * blk, by, :], dV_out[bz, bx * blk : (bx + 1) * blk, by, :]) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: 
ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -211,37 +211,37 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -251,53 +251,43 @@ def flash_bwd( T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared, use_tma=True) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared, use_tma=True) T.copy(dv, dv_shared) - 
T.atomic_add( - dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared, use_tma=True) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared, use_tma=True) T.copy(dk, dk_shared) - T.atomic_add( - dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared, use_tma=True) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared, use_tma=True) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split_novarlen(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] dk_shape = [groups, batch, seq_len, head_kv, dim_qk] # sum after kernel dv_shape = [groups, batch, seq_len, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -317,37 +307,35 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], dtype) dk_shared = T.alloc_shared([block_M, dim_qk], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, 
num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -360,16 +348,15 @@ def flash_bwd( T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) - T.copy(dv_shared, dV[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dv_shared, dV[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) T.copy(dk, dk_shared) - T.copy(dk, dK[bx % groups, bz, by * block_M:(by + 1) * block_M, bx // groups, :]) + T.copy(dk, dK[bx % groups, bz, by * block_M : (by + 1) * block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -387,7 +374,10 @@ def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -404,17 +394,8 @@ def maybe_contiguous(x): if ctx.use_atomic: kernel = flashattn_bwd_atomic_add( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -424,18 +405,9 @@ def maybe_contiguous(x): kernel(q, k, v, do, lse, delta, dq, dk, dv) dq, dk, dv = mod_post(dq, dk, dv) else: - kernel = flashattn_bwd_split( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + kernel = flashattn_bwd_split_novarlen( + BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups + ) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel @@ -443,8 +415,7 @@ def maybe_contiguous(x): dk = torch.empty(shape_k, dtype=torch.float16, device=q.device) dv = torch.empty(shape_v, dtype=torch.float16, device=q.device) kernel(q, k, v, do, lse, delta, dq, dk, dv) - dq, _, _ = 
mod_post(dq, torch.zeros_like(k, dtype=torch.float32), - torch.zeros_like(v, dtype=torch.float32)) + dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), torch.zeros_like(v, dtype=torch.float32)) dk, dv = dk.sum(0), dv.sum(0) return dq, dk, dv, None, None, None @@ -458,53 +429,45 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -521,7 +484,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -544,17 +507,15 @@ def run1(): print(f"Detected 
GPU compute capability: {arch}") assert float(arch) >= 9.0, "This example only supports GPU with compute capability >= 9.0" parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() # Handle backward compatibility and logic @@ -566,5 +527,4 @@ def run1(): # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py index 82d363768..a9f45e077 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py @@ -7,56 +7,44 @@ from einops import rearrange, repeat from bert_padding import pad_input, unpad_input -torch.manual_seed(1) - def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): assert mode in ["full", "random", "third"] if mode == "full": lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) elif mode == "random": - lengths = torch.randint( - max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) elif mode == "third": lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) - padding_mask = ( - repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths return padding_mask @tilelang.jit( - out_idx=[5, 6], pass_configs={ + out_idx=[5, 6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_fwd(batch, - total_q, - total_kv, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - groups=1): - scale = 
(1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn_fwd(batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] o_shape = [total_q, heads, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - Output: T.Tensor(o_shape, dtype), # type: ignore - lse: T.Tensor([total_q, heads], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(max_seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -78,8 +66,6 @@ def flash_fwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - for i, d in T.Parallel(block_M, dim_qk): if bx * block_M + i < q_current_seqlen: Q_shared[i, d] = Q[q_start_idx + bx * block_M + i, by, d] @@ -88,7 +74,9 @@ def flash_fwd( T.fill(acc_o, 0.0) T.fill(logsum, 0.0) - T.fill(scores_max, -T.infinity(accum_dtype)) + # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops + # We should set it to negative large number instead + T.fill(scores_max, T.Cast(accum_dtype, -1e30)) loop_range = T.ceildiv(k_current_seqlen, block_N) for k in T.Pipelined(loop_range, num_stages=1): for i, d in T.Parallel(block_N, dim_qk): @@ -99,15 +87,17 @@ def flash_fwd( if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i < q_current_seqlen and - k * block_N + j < k_current_seqlen), 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= k * block_N + j) + and (bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen), + 0, + T.Cast(accum_dtype, -1e30), + ) else: for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( - bx * block_M + i < q_current_seqlen and - k * block_N + j < k_current_seqlen, 0, -T.infinity(acc_s.dtype)) + bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen, 0, T.Cast(accum_dtype, -1e30) + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, d in T.Parallel(block_N, dim_v): if k * block_N + i < k_current_seqlen: @@ -116,6 +106,8 @@ def flash_fwd( V_shared[i, d] = 0.0 T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): 
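A quick illustration of the masking change in the hunk above (the -1e30 sentinel plus the explicit running-max update). This is a minimal NumPy sketch, not TileLang; rescale_factor, the sentinel values, and the head dimension of 192 are illustrative assumptions, not part of the kernel API. It shows why initializing scores_max to -inf breaks the online-softmax rescale for a fully masked (padded) row, while a large finite negative sentinel keeps it well defined.

import numpy as np

def rescale_factor(prev_max, cur_max, scale):
    # mirrors: scores_scale[i] = exp2(scores_max_prev[i] * scale - scores_max[i] * scale)
    return np.exp2(prev_max * scale - cur_max * scale)

scale = np.float32((1.0 / 192) ** 0.5 * 1.44269504)  # 1/sqrt(dim_qk) * log2(e), dim_qk assumed to be 192

# Fully masked row (e.g. a query slot beyond the true varlen sequence length):
# every score, and therefore the running max, equals the sentinel.
print(rescale_factor(np.float32(-np.inf), np.float32(-np.inf), scale))  # nan: (-inf) - (-inf) is undefined
print(rescale_factor(np.float32(-1e30), np.float32(-1e30), scale))      # 1.0: finite sentinel cancels cleanly

Taking the maximum of the previous and current block maxima (the added T.max(scores_max[i], scores_max_prev[i]) loop) keeps the running maximum non-decreasing, so the rescale factor never exceeds 1 even when a later block is entirely masked.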
@@ -137,27 +129,29 @@ def flash_fwd( for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale if bx * block_M + i < q_current_seqlen: - lse[q_start_idx + bx * block_M + i, by] = logsum[i] + lse[bz, by, bx * block_M + i] = logsum[i] return flash_fwd @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn_bwd_preprocess(batch, heads, total_q, max_seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + }, +) +def flashattn_bwd_preprocess(batch, heads, total_q, N_CTX, max_seq_len, dim_v): + dtype = T.float16 + accum_dtype = T.float32 shape = [total_q, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - Delta: T.Tensor([total_q, heads], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(max_seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -185,23 +179,25 @@ def flash_bwd_prep( for i in T.Parallel(blk): if by * blk + i < q_current_seqlen: - Delta[q_start_idx + by * blk + i, bx] = delta[i] + Delta[bz, bx, by * blk + i] = delta[i] return flash_bwd_prep def make_dq_layout(dQ): - # bshd -> bhld to use tma reduction instruction - return T.Layout(dQ.shape, lambda b, l, h, d: [b, h, l, d]) + # bshd -> bhsd to use tma reduction instruction + return T.Layout(dQ.shape, lambda l, h, d: [h, l, d]) @tilelang.jit( - out_idx=[3, 4, 5], pass_configs={ + out_idx=[3, 4, 5], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(total_q, total_kv, heads, head_kv, dim_qk, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] @@ -209,69 +205,62 @@ def flashattn_bwd_postprocess(total_q, total_kv, heads, head_kv, dim_qk, dim_v): @T.prim_func def flash_bwd_post( - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(q_shape, dtype), # type: ignore - dK_out: T.Tensor(k_shape, dtype), # type: ignore - dV_out: T.Tensor(v_shape, dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(q_shape, dtype), # type: ignore + dK_out: T.Tensor(k_shape, dtype), # type: ignore + dV_out: T.Tensor(v_shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(total_q, blk), heads, threads=128) as (bx, by): - # T.annotate_layout({dQ: make_dq_layout(dQ)}) - T.copy(dQ[bx * blk:(bx + 1) * blk, by, :], dQ_out[bx * blk:(bx + 1) * blk, by, :]) + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy(dQ[bx * blk : (bx + 1) * blk, by, :], dQ_out[bx * blk : (bx + 1) * blk, by, :]) with T.Kernel(T.ceildiv(total_kv, blk), head_kv, threads=128) as (bx, by): - # T.annotate_layout({ - # dK: make_dq_layout(dK), - # dV: make_dq_layout(dV), - # }) - T.copy(dK[bx * blk:(bx + 1) * blk, by, :], dK_out[bx * blk:(bx + 1) * blk, by, :]) - T.copy(dV[bx * 
blk:(bx + 1) * blk, by, :], dV_out[bx * blk:(bx + 1) * blk, by, :]) + T.annotate_layout( + { + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) + T.copy(dK[bx * blk : (bx + 1) * blk, by, :], dK_out[bx * blk : (bx + 1) * blk, by, :]) + T.copy(dV[bx * blk : (bx + 1) * blk, by, :], dV_out[bx * blk : (bx + 1) * blk, by, :]) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_atomic_add(batch, - total_q, - total_kv, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_atomic_add( + batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1 +): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] v_shape = [total_kv, head_kv, dim_v] do_shape = [total_q, heads, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor(do_shape, dtype), # type: ignore - lse: T.Tensor([total_q, heads], accum_dtype), # type: ignore - Delta: T.Tensor([total_q, heads], accum_dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor(do_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): - with T.Kernel( - heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) dsT_shared = T.alloc_shared([block_M, block_N], dtype) q = T.alloc_shared([block_N, dim_qk], dtype) @@ -286,6 +275,9 @@ def flash_bwd( dv = T.alloc_fragment([block_M, dim_v], accum_dtype) dk = T.alloc_fragment([block_M, dim_qk], accum_dtype) dq = T.alloc_fragment([block_N, dim_qk], accum_dtype) + dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) + dk_shared = T.alloc_shared([block_M, dim_qk], accum_dtype) + dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) q_start_idx = cu_seqlens_q[bz] k_start_idx = cu_seqlens_k[bz] @@ -294,71 +286,53 @@ def flash_bwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout({ - # dQ: make_dq_layout(dQ), - # dK: make_dq_layout(dK), - # 
dV: make_dq_layout(dV), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + dK: make_dq_layout(dK), + dV: make_dq_layout(dV), + } + ) - for i, d in T.Parallel(block_M, dim_qk): - if by * block_M + i < k_current_seqlen: - K_shared[i, d] = K[k_start_idx + by * block_M + i, bx // groups, d] - V_shared[i, d] = V[k_start_idx + by * block_M + i, bx // groups, d] - else: - K_shared[i, d] = 0.0 - V_shared[i, d] = 0.0 + T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) - loop_st = (T.floordiv(by * block_M, block_N) if is_causal else 0) + loop_st = T.min(T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, block_N)) if is_causal else 0 loop_ed = T.ceildiv(q_current_seqlen, block_N) for k_base in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - for i, d in T.Parallel(block_N, dim_qk): - if k_base * block_N + i < q_current_seqlen: - q[i, d] = Q[q_start_idx + k_base * block_N + i, bx, d] - else: - q[i, d] = 0.0 + T.copy(Q[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - for i in T.Parallel(block_N): - if k_base * block_N + i < q_current_seqlen: - lse_shared[i] = lse[q_start_idx + k_base * block_N + i, bx] - else: - lse_shared[i] = 0.0 + T.copy(lse[bz, bx, k_base * block_N : (k_base + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else((by * block_M + i <= k_base * block_N + j) and - (by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen), - qkT[i, j], 0) + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k_base * block_N + j) + and (by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) else: for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else( - by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen, qkT[i, j], 0) + by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen, qkT[i, j], 0 + ) - for i, d in T.Parallel(block_N, dim_v): - if k_base * block_N + i < q_current_seqlen: - do[i, d] = dO[q_start_idx + k_base * block_N + i, bx, d] - else: - do[i, d] = 0.0 + T.copy(dO[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], do) T.clear(dsT) # dsT: (block_kv, block_q) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - for i in T.Parallel(block_N): - if k_base * block_N + i < q_current_seqlen: - delta[i] = Delta[q_start_idx + k_base * block_N + i, bx] - else: - delta[i] = 0.0 + T.copy(Delta[bz, bx, k_base * block_N : (k_base + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale T.gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow) @@ -366,44 +340,42 @@ def flash_bwd( T.copy(dsT_cast, dsT_shared) T.clear(dq) T.gemm(dsT_shared, K_shared, dq, transpose_A=True) + T.copy(dq, dq_shared) T.atomic_add( - dQ[q_start_idx + k_base * block_N:q_start_idx + k_base * block_N + block_N, - bx, :], - dq, - memory_order="release") + 
dQ[q_start_idx + k_base * block_N : q_start_idx + k_base * block_N + block_N, bx, :], + dq_shared, + memory_order="relaxed", + use_tma=True, + ) + T.copy(dv, dv_shared) T.atomic_add( - dV[k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :], - dv, - memory_order="release") + dV[k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :], + dv_shared, + memory_order="relaxed", + use_tma=True, + ) + T.copy(dk, dk_shared) T.atomic_add( - dK[k_start_idx + by * block_M:k_start_idx + by * block_M + block_M, - bx // groups, :], - dk, - memory_order="release") + dK[k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :], + dk_shared, + memory_order="relaxed", + use_tma=True, + ) return flash_bwd -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd_split(batch, - total_q, - total_kv, - heads, - max_seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd_split( + batch, total_q, total_kv, N_CTX, heads, max_seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1 +): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [total_q, heads, dim_qk] k_shape = [total_kv, head_kv, dim_qk] @@ -411,25 +383,24 @@ def flashattn_bwd_split(batch, do_shape = [total_q, heads, dim_v] dk_shape = [groups, total_kv, head_kv, dim_qk] # sum after kernel dv_shape = [groups, total_kv, head_kv, dim_v] # sum after kernel - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor(do_shape, dtype), # type: ignore - lse: T.Tensor([total_q, heads], accum_dtype), # type: ignore - Delta: T.Tensor([total_q, heads], accum_dtype), # type: ignore - cu_seqlens_q: T.Tensor([batch + 1], "int32"), # type: ignore - cu_seqlens_k: T.Tensor([batch + 1], "int32"), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(dk_shape, dtype), # type: ignore - dV: T.Tensor(dv_shape, dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor(do_shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, N_CTX], accum_dtype), # type: ignore + cu_seqlens_q: T.Tensor([batch + 1], T.int32), # type: ignore + cu_seqlens_k: T.Tensor([batch + 1], T.int32), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(dk_shape, dtype), # type: ignore + dV: T.Tensor(dv_shape, dtype), # type: ignore ): - with T.Kernel( - heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) dsT_shared = T.alloc_shared([block_M, block_N], dtype) q = T.alloc_shared([block_N, dim_qk], dtype) @@ -454,67 +425,52 @@ def flash_bwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = 
k_end_idx - k_start_idx - T.annotate_layout({ - # dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) - for i, d in T.Parallel(block_M, dim_qk): - if by * block_M + i < k_current_seqlen: - K_shared[i, d] = K[k_start_idx + by * block_M + i, bx // groups, d] - V_shared[i, d] = V[k_start_idx + by * block_M + i, bx // groups, d] - else: - K_shared[i, d] = 0.0 - V_shared[i, d] = 0.0 + T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) - loop_st = (T.floordiv(by * block_M, block_N) if is_causal else 0) + loop_st = T.min(T.floordiv(by * block_M, block_N), T.floordiv(q_current_seqlen, block_N)) if is_causal else 0 loop_ed = T.ceildiv(q_current_seqlen, block_N) for k_base in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - for i, d in T.Parallel(block_N, dim_qk): - if k_base * block_N + i < q_current_seqlen: - q[i, d] = Q[q_start_idx + k_base * block_N + i, bx, d] - else: - q[i, d] = 0.0 + # Note: The padding zero of varlen should be considered in T.copy + T.copy(Q[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - for i, d in T.Parallel(block_N, dim_v): - if k_base * block_N + i < q_current_seqlen: - do[i, d] = dO[q_start_idx + k_base * block_N + i, bx, d] - else: - do[i, d] = 0.0 + + T.copy(dO[q_start_idx + k_base * block_N : q_start_idx + (k_base + 1) * block_N, bx, :], do) + T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - for i in T.Parallel(block_N): - if k_base * block_N + i < q_current_seqlen: - lse_shared[i] = lse[q_start_idx + k_base * block_N + i, bx] - else: - lse_shared[i] = 0.0 + + T.copy(lse[bz, bx, k_base * block_N : (k_base + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else((by * block_M + i <= k_base * block_N + j) and - (by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen), - qkT[i, j], 0) + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k_base * block_N + j) + and (by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) else: for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.if_then_else( - by * block_M + i < k_current_seqlen and - k_base * block_N + j < q_current_seqlen, qkT[i, j], 0) + by * block_M + i < k_current_seqlen and k_base * block_N + j < q_current_seqlen, qkT[i, j], 0 + ) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - for i in T.Parallel(block_N): - if k_base * block_N + i < q_current_seqlen: - delta[i] = Delta[q_start_idx + k_base * block_N + i, bx] - else: - delta[i] = 0.0 + + T.copy(Delta[bz, bx, k_base * block_N : (k_base + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -525,57 +481,38 @@ def flash_bwd( T.gemm(dsT_shared, K_shared, dq, transpose_A=True) for i, j in T.Parallel(block_N, dim_qk): if k_base * block_N + i < q_current_seqlen: - 
T.atomic_add(dQ[q_start_idx + k_base * block_N + i, bx, j], dq[i, j]) + T.atomic_add(dQ[q_start_idx + k_base * block_N + i, bx, j], dq[i, j], memory_order="relaxed") T.copy(dv, dv_shared) - for i, d in T.Parallel(block_M, dim_v): - if by * block_M + i < k_current_seqlen: - dV[bx % groups, k_start_idx + by * block_M + i, bx // groups, d] = dv[i, d] + T.copy(dv_shared, dV[bx % groups, k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :]) T.copy(dk, dk_shared) - for i, d in T.Parallel(block_M, dim_qk): - if by * block_M + i < k_current_seqlen: - dK[bx % groups, k_start_idx + by * block_M + i, bx // groups, d] = dk[i, d] + T.copy(dk_shared, dK[bx % groups, k_start_idx + by * block_M : k_start_idx + by * block_M + block_M, bx // groups, :]) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod - def forward(ctx, - q, - k, - v, - seqlens_q, - seqlens_k, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - causal, - groups=1, - use_atomic=True): + def forward( + ctx, q, k, v, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, groups=1, use_atomic=True + ): BATCH, N_CTX, H, D_HEAD_QK = q.shape D_HEAD_V = v.shape[-1] block_M = 128 block_N = 64 - q_unpad, indices_q, _, _ = unpad_input( - q, (torch.arange(N_CTX, device=q.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) - k_unpad, indices_k, _, _ = unpad_input( - k, (torch.arange(N_CTX, device=k.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) - v_unpad, _, _, _ = unpad_input( - v, (torch.arange(N_CTX, device=v.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) + q_unpad, indices_q, _, _ = unpad_input(q, (torch.arange(N_CTX, device=q.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) + k_unpad, indices_k, _, _ = unpad_input(k, (torch.arange(N_CTX, device=k.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) + v_unpad, _, _, _ = unpad_input(v, (torch.arange(N_CTX, device=v.device).unsqueeze(0) < seqlens_k.unsqueeze(1))) total_q = q_unpad.shape[0] total_kv = k_unpad.shape[0] - mod = flashattn_fwd(BATCH, total_q, total_kv, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, causal, - block_M, block_N, groups) + mod = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, causal, block_M, block_N, groups) o_unpad, lse = mod(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k) o = pad_input(o_unpad, indices_q, BATCH, N_CTX) - ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, - cu_seqlens_q, cu_seqlens_k) + ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k) + ctx.batch = BATCH ctx.causal = causal ctx.use_atomic = use_atomic ctx.max_seqlen_q = max_seqlen_q @@ -587,9 +524,9 @@ def forward(ctx, @staticmethod def backward(ctx, do): N_CTX = do.shape[1] - q, k, v, o, lse, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors - do_unpad, _, _, _ = unpad_input( - do, (torch.arange(N_CTX, device=do.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) + q, k, v, o, lse_clone, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors + # lse_clone = lse.clone() + do_unpad, _, _, _ = unpad_input(do, (torch.arange(N_CTX, device=do.device).unsqueeze(0) < seqlens_q.unsqueeze(1))) total_q, H, D_HEAD_QK = q.shape total_kv, HEAD_KV, D_HEAD_V = v.shape groups = H // HEAD_KV @@ -603,7 +540,7 @@ def maybe_contiguous(x): do, q, k, v, o = [maybe_contiguous(x) for x in (do_unpad, q, k, v, o)] block_M = 128 block_N = 32 - mod_prep = 
flashattn_bwd_preprocess(BATCH, H, total_q, ctx.max_seqlen_q, D_HEAD_V) + mod_prep = flashattn_bwd_preprocess(BATCH, H, total_q, N_CTX, ctx.max_seqlen_q, D_HEAD_V) mod_post = flashattn_bwd_postprocess(total_q, total_kv, H, HEAD_KV, D_HEAD_QK, D_HEAD_V) delta = mod_prep(o, do, cu_seqlens_q) @@ -612,6 +549,7 @@ def maybe_contiguous(x): BATCH, total_q, total_kv, + N_CTX, H, ctx.max_seqlen_q, D_HEAD_QK, @@ -621,17 +559,19 @@ def maybe_contiguous(x): block_N, threads=256, num_stages=2, - groups=groups) + groups=groups, + ) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.zeros_like(k, dtype=torch.float32) dv = torch.zeros_like(v, dtype=torch.float32) - kernel(q, k, v, do, lse, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) + kernel(q, k, v, do, lse_clone, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) dq, dk, dv = mod_post(dq, dk, dv) else: kernel = flashattn_bwd_split( BATCH, total_q, total_kv, + N_CTX, H, ctx.max_seqlen_q, D_HEAD_QK, @@ -641,13 +581,13 @@ def maybe_contiguous(x): block_N, threads=256, num_stages=2, - groups=groups) + groups=groups, + ) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.empty(groups, *k.shape, dtype=torch.float16, device=q.device) dv = torch.empty(groups, *v.shape, dtype=torch.float16, device=q.device) - kernel(q, k, v, do, lse, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) - dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), - torch.zeros_like(v, dtype=torch.float32)) + kernel(q, k, v, do, lse_clone, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) + dq, _, _ = mod_post(dq, torch.zeros_like(k, dtype=torch.float32), torch.zeros_like(v, dtype=torch.float32)) dk, dv = dk.sum(0), dv.sum(0) dq = pad_input(dq, ctx.indices_q, BATCH, N_CTX) @@ -666,15 +606,13 @@ def ref_program(Q, K, V, padding_mask, is_causal, groups=1): # HQ = HKV * groups # To handle precision issue Q, K, V = Q.float(), K.float(), V.float() - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if padding_mask is not None: scores.masked_fill_(rearrange(~padding_mask, "b s -> b 1 1 s"), float("-inf")) @@ -682,41 +620,35 @@ def ref_program(Q, K, V, padding_mask, is_causal, groups=1): seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) if padding_mask is not None: output.masked_fill_(rearrange(~padding_mask, "b s -> b s 1 1"), 0.0) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False, - use_atomic: bool = True): +def main( + BATCH: int = 1, + H: int = 32, + 
N_CTX: int = 256, + D_HEAD_QK: int = 192, + D_HEAD_V: int = 128, + groups: int = 16, + causal: bool = False, + use_atomic: bool = True, +): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() padding_mask = generate_random_padding_mask(N_CTX, BATCH, "cuda", mode="random") seqlens_q = padding_mask.sum(dim=-1, dtype=torch.int32) cu_seqlens_q = F.pad(torch.cumsum(seqlens_q, dim=0, dtype=torch.int32), (1, 0)) @@ -725,8 +657,7 @@ def main(BATCH: int = 1, # In training backward pass, seqlens_k should be the same as seqlens_q seqlens_k, cu_seqlens_k, max_seqlen_k = seqlens_q, cu_seqlens_q, max_seqlen_q - O = attention(Q, K, V, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, - max_seqlen_k, causal, groups, use_atomic) + O = attention(Q, K, V, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, groups, use_atomic) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None dK, K.grad = K.grad.clone(), None @@ -738,12 +669,6 @@ def main(BATCH: int = 1, dK_ref, K.grad = K.grad.clone(), None dV_ref, V.grad = V.grad.clone(), None - torch.testing.assert_close(O, O_ref.half(), rtol=1e-2, atol=1e-2) - torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) - torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') - def run(): O_ref.backward(dO, retain_graph=True) @@ -759,24 +684,85 @@ def run1(): print("tilelang: {:.2f} ms".format(latency)) print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + torch.testing.assert_close(O, O_ref.half(), rtol=1e-2, atol=1e-2) + torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) + torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) + torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) + print("All checks passed.✅") + print( + "Note: this varlen kernel performance is as good as the non-varlen kernel shown in Nsight-Compute. As you may observe that the TFLOPS is a bit lower, that's because the unpad operation is included in the above benchmark." 
+ ) + + +def run_regression_perf(): + BATCH = 1 + H = 32 + N_CTX = 256 + D_HEAD_QK = 192 + D_HEAD_V = 128 + groups = 16 + causal = False + device = "cuda" + torch.manual_seed(42) + total_q = BATCH * N_CTX + total_kv = BATCH * N_CTX + head_kv = H // groups + Q = torch.randn(total_q, H, D_HEAD_QK, device=device, dtype=torch.half) + K = torch.randn(total_kv, head_kv, D_HEAD_QK, device=device, dtype=torch.half) + V = torch.randn(total_kv, head_kv, D_HEAD_V, device=device, dtype=torch.half) + O = torch.randn(total_q, H, D_HEAD_V, device=device, dtype=torch.half) + dO = torch.randn(total_q, H, D_HEAD_V, device=device, dtype=torch.half) + cu_seqlens_q = torch.arange(0, (BATCH + 1) * N_CTX, N_CTX, device=device, dtype=torch.int32) + cu_seqlens_k = cu_seqlens_q + max_seqlen_q = N_CTX + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, total_q, N_CTX, max_seqlen_q, D_HEAD_V) + kernel = flashattn_bwd_split( + BATCH, + total_q, + total_kv, + N_CTX, + H, + max_seqlen_q, + D_HEAD_QK, + D_HEAD_V, + causal, + block_M=128, + block_N=32, + threads=256, + num_stages=2, + groups=groups, + ) + dQ = torch.zeros_like(Q, dtype=torch.float32) + dK = torch.zeros(groups, total_kv, head_kv, D_HEAD_QK, device=device, dtype=torch.float16) + dV = torch.zeros(groups, total_kv, head_kv, D_HEAD_V, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO, cu_seqlens_q) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, cu_seqlens_q, cu_seqlens_k, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + if __name__ == "__main__": arch = nvcc.get_target_compute_version() print(f"Detected GPU compute capability: {arch}") assert float(arch) >= 9.0, "This example only supports GPU with compute capability >= 9.0" parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument( - '--use_atomic', action='store_true', default=False, help='Use atomic add for dK/dV') - parser.add_argument( - '--use_split', action='store_true', default=False, help='Use split for dK/dV') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--use_atomic", action="store_true", default=False, help="Use atomic add for dK/dV") + parser.add_argument("--use_split", action="store_true", default=False, help="Use split for dK/dV") args = parser.parse_args() + # Can be set to True/False for testing + args.causal = True # Handle backward compatibility and logic if 
args.use_split: @@ -787,5 +773,4 @@ def run1(): # Default: use atomic use_atomic = True - main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, - use_atomic) + main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal, use_atomic) diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py index ed07e7d9d..2da64472c 100644 --- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py @@ -6,25 +6,27 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, groups=1): - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + Output: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -39,26 +41,25 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): 
+ scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim_v): @@ -72,29 +73,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim_v): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim_v): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim_v] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -103,50 +106,42 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim_v, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) -def flashattn_bwd(batch, - heads, - seq_len, - dim_qk, - dim_v, - is_causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=1): - sm_scale = (1.0 / dim_qk)**0.5 - scale = (1.0 / dim_qk)**0.5 * 1.44269504 # log2(e) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd(batch, heads, seq_len, dim_qk, dim_v, is_causal, block_M, block_N, threads=256, num_stages=2, groups=1): + sm_scale = (1.0 / dim_qk) ** 0.5 + scale = (1.0 / dim_qk) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim_qk] k_shape = [batch, seq_len, head_kv, dim_qk] v_shape = [batch, seq_len, head_kv, dim_v] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(q_shape, dtype), # type: ignore - K: T.Tensor(k_shape, dtype), # type: ignore - V: T.Tensor(v_shape, dtype), # type: ignore - dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(q_shape, accum_dtype), # type: ignore - dK: T.Tensor(k_shape, accum_dtype), # type: ignore - 
dV: T.Tensor(v_shape, accum_dtype), # type: ignore + Q: T.Tensor(q_shape, dtype), # type: ignore + K: T.Tensor(k_shape, dtype), # type: ignore + V: T.Tensor(v_shape, dtype), # type: ignore + dO: T.Tensor([batch, seq_len, heads, dim_v], dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(q_shape, accum_dtype), # type: ignore + dK: T.Tensor(k_shape, accum_dtype), # type: ignore + dV: T.Tensor(v_shape, accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim_qk], dtype) @@ -167,45 +162,30 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx // groups, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx // groups, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) - T.gemm( - K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) - T.gemm( - V_shared, - do, - dsT, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow, - wg_wait=-1) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) T.wait_wgmma(1) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -217,18 +197,17 @@ def flash_bwd( T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) T.wait_wgmma(0) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) T.copy(dv, dv_shared) - T.atomic_add(dV[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) T.copy(dk, dk_shared) 
- T.atomic_add(dK[bz, by * block_M:(by + 1) * block_M, bx // groups, :], dk_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) return flash_bwd @torch.compile class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): BATCH, N_CTX, H, D_HEAD_QK = q.shape @@ -246,7 +225,10 @@ def forward(ctx, q, k, v, causal, groups=1, use_atomic=True): def backward(ctx, do): q, k, v, o, lse = ctx.saved_tensors BATCH, N_CTX, H, D_HEAD_QK = q.shape - HEAD_KV, D_HEAD_V, = v.shape[-2], v.shape[-1] + ( + HEAD_KV, + D_HEAD_V, + ) = v.shape[-2], v.shape[-1] groups = H // HEAD_KV def maybe_contiguous(x): @@ -260,18 +242,7 @@ def maybe_contiguous(x): mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) delta = mod_prep(o, do) - kernel = flashattn_bwd( - BATCH, - H, - N_CTX, - D_HEAD_QK, - D_HEAD_V, - ctx.causal, - block_M, - block_N, - threads=256, - num_stages=2, - groups=groups) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [BATCH, N_CTX, HEAD_KV, D_HEAD_QK] shape_v = [BATCH, N_CTX, HEAD_KV, D_HEAD_V] @@ -294,52 +265,36 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D_QK] # V: [B, T, HV, D_V] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim_qk = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim_qk, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(BATCH: int = 1, - H: int = 32, - N_CTX: int = 256, - D_HEAD_QK: int = 192, - D_HEAD_V: int = 128, - groups: int = 16, - causal: bool = False): +def main(BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False): flops_per_qk = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_QK flops_per_v = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD_V total_flops = 3 * flops_per_qk + 2 * flops_per_v if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() head_kv = H // groups - K = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - V = ( - torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, - device="cuda").normal_().requires_grad_()) - dO = ( - torch.empty(BATCH, N_CTX, H, D_HEAD_V, 
dtype=torch.half, - device="cuda").normal_().requires_grad_()) + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() O = attention(Q, K, V, causal, groups) O.backward(dO, retain_graph=True) dQ, Q.grad = Q.grad.clone(), None @@ -356,7 +311,7 @@ def main(BATCH: int = 1, torch.testing.assert_close(dV, dV_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dK, dK_ref, rtol=1e-2, atol=1e-2) torch.testing.assert_close(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -374,15 +329,34 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf( + BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False +): + Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + + head_kv = H // groups + K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_() + V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_() + O = attention(Q, K, V, causal, groups) + + def run1(): + O.backward(dO, retain_graph=True) + + from tilelang.profiler import do_bench + + return do_bench(run1, warmup=500, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head_qk', type=int, default=192, help='Head dimension for Q/K') - parser.add_argument('--d_head_v', type=int, default=128, help='Head dimension for V') - parser.add_argument('--causal', action='store_true', help='Causal flag') - parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head_qk", type=int, default=192, help="Head dimension for Q/K") + parser.add_argument("--d_head_v", type=int, default=128, help="Head dimension for V") + parser.add_argument("--causal", action="store_true", help="Causal flag") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head_qk, args.d_head_v, args.groups, args.causal) diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py index 4d9d06a4f..e884a8158 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd.py +++ b/examples/flash_attention/example_gqa_fwd_bshd.py @@ -9,7 +9,6 @@ class FlashAttentionTuneSpace: - def __init__( self, block_sizes=(64, 128, 256), @@ -40,7 +39,7 @@ def get_configs(user_config=None): warp_M = block_M // warp_count warp_N = block_N // warp_count - if (warp_M % config.warp_alignment != 0 
or warp_N % config.warp_alignment != 0): + if warp_M % config.warp_alignment != 0 or warp_N % config.warp_alignment != 0: continue shared_mem = 2 * config.dtype_bytes * config.dim * (block_M + block_N) @@ -48,114 +47,38 @@ def get_configs(user_config=None): continue for num_stages in config.num_stages_range: - valid_configs.append({ - "block_M": block_M, - "block_N": block_N, - "num_stages": num_stages, - "threads": threads, - }) + valid_configs.append( + { + "block_M": block_M, + "block_N": block_N, + "num_stages": num_stages, + "threads": threads, + } + ) return valid_configs @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - groups=1, - block_M=64, - block_N=64, - num_stages=0, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, groups=1, block_M=64, block_N=64, num_stages=0, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. 
- # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -171,25 +94,49 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) + T.gemm(acc_s_cast, V_shared, 
acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main @@ -199,50 +146,34 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D] # V: [B, T, HV, D] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output -def main(batch: int = 1, - heads: int = 64, - seq_len: int = 4096, - dim: int = 128, - is_causal: bool = False, - groups: int = 16, - tune: bool = False): +def main( + batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False +): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - groups=groups, - block_M=64, - block_N=64, - num_stages=2, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -264,14 +195,22 @@ def main(batch: int = 1, print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False +): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') - 
parser.add_argument('--groups', type=int, default=16, help='groups') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups, args.tune) diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py index 1c1fc12d2..73a725d9f 100644 --- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py @@ -24,9 +24,11 @@ def get_configs(): rep=10, ) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn( batch, heads, @@ -39,90 +41,19 @@ def flashattn( num_stages=0, threads=128, ): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [batch, seq_len, heads, dim] kv_shape = [batch, seq_len, head_kv, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by // groups, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by // groups, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. 
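Throughout these GQA examples the kernel reads K and V with head index `by // groups`, while `ref_program` expands K and V with `repeat_interleave(groups, dim=2)`. A small sketch (shapes chosen arbitrarily for illustration, not from this diff) confirming the two views of grouped heads agree:

# GQA head-mapping sketch: indexing K with q_head // groups is equivalent to
# expanding K via repeat_interleave(groups, dim=2) and indexing with q_head.
import torch

batch, seq_len, heads, dim, groups = 2, 16, 8, 4, 4
head_kv = heads // groups                      # 2 KV heads shared by 8 query heads
K = torch.randn(batch, seq_len, head_kv, dim)

K_expanded = K.repeat_interleave(groups, dim=2)    # what ref_program does
for q_head in range(heads):
    kv_head = q_head // groups                     # what the kernel does (by // groups)
    assert torch.equal(K_expanded[:, :, q_head, :], K[:, :, kv_head, :])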
- # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -138,30 +69,55 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, 
dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main @@ -171,23 +127,21 @@ def ref_program(Q, K, V, is_causal, groups=1): # K: [B, T, HK, D] # V: [B, T, HV, D] # HQ = HKV * groups - assert Q.size(2) == K.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" - assert Q.size(2) == V.size( - 2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" + assert Q.size(2) == K.size(2) * groups, f"Q.size(2): {Q.size(2)}, K.size(2): {K.size(2)}, groups: {groups}" + assert Q.size(2) == V.size(2) * groups, f"Q.size(2): {Q.size(2)}, V.size(2): {V.size(2)}, groups: {groups}" dim = Q.size(-1) K = K.repeat_interleave(groups, dim=2) V = V.repeat_interleave(groups, dim=2) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -205,18 +159,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - groups=groups, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal, groups=groups) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -238,14 +182,28 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, + heads: int = 64, + seq_len: int = 4096, + dim: int = 128, + is_causal: bool = False, + groups: int = 16, +): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') - parser.add_argument('--groups', type=int, default=16, help='groups') + 
parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") + parser.add_argument("--groups", type=int, default=16, help="groups") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.groups, args.tune) diff --git a/examples/flash_attention/example_gqa_fwd_varlen.py b/examples/flash_attention/example_gqa_fwd_varlen.py index 37e81ebb3..0e8e21c43 100644 --- a/examples/flash_attention/example_gqa_fwd_varlen.py +++ b/examples/flash_attention/example_gqa_fwd_varlen.py @@ -4,80 +4,36 @@ import tilelang import tilelang.language as T import tilelang.testing -from einops import rearrange, repeat from tilelang.profiler import do_bench from varlen_utils import generate_random_padding_mask, generate_qkv -def attention_ref( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - causal=False, - window_size=(-1, -1), - upcast=True, -): - if causal: - window_size = (window_size[0], 0) - dtype_og = q.dtype - if upcast: - q, k, v = q.float(), k.float(), v.float() - dim = q.shape[-1] - scale = (1.0 / dim)**0.5 - k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) - v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) - scores = torch.einsum("bthd,bshd->bhts", q, k) - if key_padding_mask is not None: - scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) - scores = scores * scale - attention = torch.softmax(scores, dim=-1).to(v.dtype) - - if query_padding_mask is not None: - attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) - output = torch.einsum("bhts,bshd->bthd", attention, v) - if query_padding_mask is not None: - output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) - return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) - - @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch_size, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch_size, groups, UQ, UKV, heads, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) head_kv = heads // groups q_shape = [UQ, heads, dim] kv_shape = [UKV, head_kv, dim] o_shape = [UQ, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(kv_shape, dtype), - V_unpad: T.Tensor(kv_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, 
block_M), heads, batch_size, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -96,54 +52,51 @@ def main( kv_head_idx = head_idx // groups q_start_idx = cu_seqlens_q[batch_idx] - k_start_idx = cu_seqlens_k[batch_idx] - v_start_idx = cu_seqlens_k[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] q_end_idx = cu_seqlens_q[batch_idx + 1] k_end_idx = cu_seqlens_k[batch_idx + 1] - v_end_idx = cu_seqlens_k[batch_idx + 1] q_current_seqlen = q_end_idx - q_start_idx - k_current_seqlen = k_end_idx - k_start_idx - v_current_seqlen = v_end_idx - v_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx - T.copy( - Q_unpad[q_start_idx + bx * block_M:q_start_idx + (bx + 1) * block_M, head_idx, :], - Q_shared) - for i, d in T.Parallel(block_M, dim): - if bx * block_M + i >= q_current_seqlen: - Q_shared[i, d] = 0 + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(k_current_seqlen, block_N) + offset = kv_current_seqlen - q_current_seqlen # always align on the right + max_visible_k_idx = offset + (bx + 1) * block_M + loop_range = ( + T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + if is_causal + else T.ceildiv(kv_current_seqlen, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy( - K_unpad[k_start_idx + k * block_N:k_start_idx + (k + 1) * block_N, - kv_head_idx, :], K_shared) - for i, d in T.Parallel(block_N, dim): - if k * block_N + i >= k_current_seqlen: - K_shared[i, d] = 0 + T.copy(K_unpad[kv_start_idx + k * block_N : kv_start_idx + (k + 1) * block_N, kv_head_idx, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), -1e9, 0 + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) @@ -157,19 +110,15 @@ def main( for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scores_scale[i] - T.copy( - V_unpad[v_start_idx + k * block_N:v_start_idx + (k + 1) * block_N, - kv_head_idx, :], V_shared) - for i, d in T.Parallel(block_N, dim): - if k * block_N + i >= v_current_seqlen: - V_shared[i, d] = 0 + T.copy(V_unpad[kv_start_idx + k * block_N : kv_start_idx + (k + 1) * block_N, kv_head_idx, :], V_shared) T.gemm(acc_s_cast, 
V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_M, dim): - acc_o[i, j] /= logsum[i] - T.copy(acc_o, O_shared) + # When sq > skv, some tokens can see nothing + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + T.copy(acc_o, O_shared) for i, d in T.Parallel(block_M, dim): if bx * block_M + i < q_current_seqlen: Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = O_shared[i, d] @@ -177,13 +126,9 @@ def main( return main -def main(batch: int = 1, - heads: int = 64, - q_seqlen: int = 2048, - k_seqlen: int = 2048, - dim: int = 128, - groups: int = 16, - is_causal: bool = False): +def main( + batch: int = 1, heads: int = 64, q_seqlen: int = 2048, k_seqlen: int = 2048, dim: int = 128, groups: int = 16, is_causal: bool = False +): assert heads % groups == 0, "heads must be divisible by groups" flops_per_matmul = 2.0 * batch * heads * q_seqlen * k_seqlen * dim @@ -191,8 +136,7 @@ def main(batch: int = 1, tilelang.testing.set_random_seed(0) - causal = False - if causal: + if is_causal: total_flops *= 0.5 tilelang.testing.set_random_seed(0) @@ -201,9 +145,9 @@ def main(batch: int = 1, device = torch.device("cuda") head_kv = heads // groups - q = torch.randn(batch, q_seqlen, heads, dim, dtype=dtype, device=device, requires_grad=True) - k = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device, requires_grad=True) - v = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device, requires_grad=True) + q = torch.randn(batch, q_seqlen, heads, dim, dtype=dtype, device=device) + k = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + v = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) query_padding_mask = generate_random_padding_mask(q_seqlen, batch, device, mode="random") key_padding_mask = generate_random_padding_mask(k_seqlen, batch, device, mode="random") @@ -222,53 +166,46 @@ def main(batch: int = 1, output_pad_fn, _, _, - ) = generate_qkv( - q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) UQ = q_unpad.shape[0] UKV = k_unpad.shape[0] - kernel = flashattn( - batch, - groups, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128) + kernel = flashattn(batch, groups, UQ, UKV, heads, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) out = output_pad_fn(out_unpad) - out_ref, _ = attention_ref( - q, - k, - v, - query_padding_mask=query_padding_mask, - key_padding_mask=key_padding_mask, + import flash_attn + + fa_out_unpad = flash_attn.flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 0.0, causal=is_causal, ) - torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) + fa_out = output_pad_fn(fa_out_unpad) + torch.testing.assert_close(out, fa_out, rtol=1e-2, atol=1e-2) + print("All checks passed.✅") - latency = do_bench( - lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q)) + latency = do_bench(lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q), _n_warmup=5, _n_repeat=5) print("Tile-lang: {:.2f} ms".format(latency)) print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) if __name__ == "__main__": parser = argparse.ArgumentParser() - 
parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='query heads') - parser.add_argument('--groups', type=int, default=16, help='groups') - parser.add_argument('--q_seqlen', type=int, default=2048, help='query sequence length') - parser.add_argument('--k_seqlen', type=int, default=2048, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=128, help='head dim') - parser.add_argument('--is_causal', action='store_true', help='causal attention') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + parser.add_argument("--is_causal", action="store_true", help="causal attention") args = parser.parse_args() - main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, - args.is_causal) + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal) diff --git a/examples/flash_attention/example_mha_bwd_bhsd.py b/examples/flash_attention/example_mha_bwd_bhsd.py index 1595ae764..34e8fefc5 100644 --- a/examples/flash_attention/example_mha_bwd_bhsd.py +++ b/examples/flash_attention/example_mha_bwd_bhsd.py @@ -7,22 +7,24 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -38,29 +40,28 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) # T.copy(Q_shared, Q_local) # for i, j in T.Parallel(block_M, dim): # Q_local[i, j] *= scale - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, by, k * block_N:(k + 
1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -74,29 +75,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(acc_o, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -105,68 +108,71 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, bx, by * blk:(by + 1) * blk, k * blk:(k + 1) * blk], do) + T.copy(O[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, bx, by * blk : (by + 1) * blk, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, h, l, d: [b, h, l // 8, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, 
dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, heads, seq_len, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, by, bx * blk:(bx + 1) * blk, :], - dQ_out[bz, by, bx * blk:(bx + 1) * blk, :], + dQ[bz, by, bx * blk : (bx + 1) * blk, :], + dQ_out[bz, by, bx * blk : (bx + 1) * blk, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, heads, seq_len, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=128) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -190,36 +196,36 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - }) - T.copy(K[bz, bx, by * block_M:(by + 1) * block_M, :], K_shared) - T.copy(V[bz, bx, by * block_M:(by + 1) * block_M, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, bx, by * block_M : (by + 1) * block_M, :], K_shared) + T.copy(V[bz, bx, by * block_M : (by + 1) * block_M, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, bx, k * block_N:(k + 1) * block_N, :], q) + T.copy(Q[bz, bx, k * block_N : (k + 1) * block_N, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], 
lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, bx, k * block_N:(k + 1) * block_N, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. + T.copy(dO[bz, bx, k * block_N : (k + 1) * block_N, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -232,14 +238,13 @@ def flash_bwd( T.atomic_add(dQ[bz, bx, k * block_N + i, j], dq[i, j]) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, bx, by * block_M:(by + 1) * block_M, :]) - T.copy(dk_shared, dK[bz, bx, by * block_M:(by + 1) * block_M, :]) + T.copy(dv_shared, dV[bz, bx, by * block_M : (by + 1) * block_M, :]) + T.copy(dk_shared, dK[bz, bx, by * block_M : (by + 1) * block_M, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, H, N_CTX, D_HEAD = q.shape @@ -281,15 +286,15 @@ def maybe_contiguous(x): def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(2) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -304,9 +309,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, H, N_CTX, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, H, N_CTX, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -345,12 +348,43 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 16 + N_CTX = 512 + D_HEAD = 64 + causal = False + device = "cuda" + torch.manual_seed(0) + block_M = 64 + block_N = 64 if D_HEAD <= 64 else 32 + Q = torch.randn(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.half) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + O = torch.randn_like(Q) + dO = torch.randn_like(Q) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) + dQ = torch.zeros(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.float32) + dK = torch.zeros(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.float16) + dV = 
torch.zeros(BATCH, H, N_CTX, D_HEAD, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_bwd.py b/examples/flash_attention/example_mha_bwd_bshd.py similarity index 65% rename from examples/flash_attention/example_mha_bwd.py rename to examples/flash_attention/example_mha_bwd_bshd.py index 543c2c0e7..fc8328fa4 100644 --- a/examples/flash_attention/example_mha_bwd.py +++ b/examples/flash_attention/example_mha_bwd_bshd.py @@ -7,22 +7,24 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -38,25 +40,25 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + 
i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -70,29 +72,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -101,68 +105,71 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep def make_dq_layout(dQ): # atomicAdd can not be vectorized, so we need to reorder dq to match the 8x8 gemm fragment - return T.Layout(dQ.shape, - lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) + return T.Layout(dQ.shape, lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2]) @tilelang.jit( - out_idx=[1], pass_configs={ + out_idx=[1], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_postprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 64 @T.prim_func def flash_bwd_post( - dQ: T.Tensor(shape, 
accum_dtype), # type: ignore - dQ_out: T.Tensor(shape, dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dQ_out: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): T.annotate_layout({dQ: make_dq_layout(dQ)}) T.copy( - dQ[bz, bx * blk:(bx + 1) * blk, by, :], - dQ_out[bz, bx * blk:(bx + 1) * blk, by, :], + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], ) return flash_bwd_post -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=128) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -186,33 +193,36 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim], dtype) dk_shared = T.alloc_shared([block_M, dim], dtype) - T.annotate_layout({ - dQ: make_dq_layout(dQ), - }) - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared) + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, 
j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -225,14 +235,13 @@ def flash_bwd( T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M:(by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M:(by + 1) * block_M, bx, :]) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, N_CTX, H, D_HEAD = q.shape @@ -274,15 +283,15 @@ def maybe_contiguous(x): def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -297,9 +306,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -336,12 +343,43 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 16 + N_CTX = 512 + D_HEAD = 64 + causal = False + device = "cuda" + torch.manual_seed(42) + block_M = 64 + block_N = 64 if D_HEAD <= 64 else 32 + Q = torch.randn(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.half) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + O = torch.randn_like(Q) + dO = torch.randn_like(Q) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) + dQ = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float32) + dK = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float16) + dV = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float16) + Delta = mod_prep(O, dO) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, 
default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py similarity index 64% rename from examples/flash_attention/example_mha_bwd_wgmma_pipelined.py rename to examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py index 7ad417ef5..c0fe4e33d 100644 --- a/examples/flash_attention/example_mha_bwd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py @@ -7,22 +7,24 @@ @tilelang.jit( - out_idx=[3, 4], pass_configs={ + out_idx=[3, 4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + Output: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -37,27 +39,26 @@ def flash_fwd( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({Q_shared: tilelang.layout.make_swizzled_layout(Q_shared)}) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = ( - T.ceildiv( - (bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N)) + loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: - T.clear(acc_s) + for i, j in 
T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): @@ -71,29 +72,31 @@ def flash_fwd( logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M:(bx + 1) * block_M]) + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) return flash_fwd @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 shape = [batch, seq_len, heads, dim] blk = 32 @T.prim_func def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + O: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): o = T.alloc_fragment([blk, blk], dtype) @@ -102,37 +105,39 @@ def flash_bwd_prep( delta = T.alloc_fragment([blk], accum_dtype) T.clear(acc) for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], o) - T.copy(dO[bz, by * blk:(by + 1) * blk, bx, k * blk:(k + 1) * blk], do) + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) for i, j in T.Parallel(blk, blk): acc[i, j] += o[i, j] * do[i, j] T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk:(by + 1) * blk]) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) return flash_bwd_prep -@tilelang.jit(pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, -}) +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim)**0.5 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: 
T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore + Q: T.Tensor(shape, dtype), # type: ignore + K: T.Tensor(shape, dtype), # type: ignore + V: T.Tensor(shape, dtype), # type: ignore + dO: T.Tensor(shape, dtype), # type: ignore + lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore + dQ: T.Tensor(shape, accum_dtype), # type: ignore + dK: T.Tensor(shape, dtype), # type: ignore + dV: T.Tensor(shape, dtype), # type: ignore ): with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=256) as (bx, by, bz): K_shared = T.alloc_shared([block_M, dim], dtype) @@ -157,47 +162,34 @@ def flash_bwd( dk_shared = T.alloc_shared([block_M, dim], dtype) dq_shared = T.alloc_shared([block_N, dim], accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - }) - - T.copy(K[bz, by * block_M:(by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M:(by + 1) * block_M, bx, :], V_shared) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) T.clear(dv) T.clear(dk) loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 loop_ed = T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N:(k + 1) * block_N, bx, :], q) + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) T.clear(qkT) - T.gemm( - K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(dO[bz, k * block_N:(k + 1) * block_N, bx, :], do) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) T.clear(dsT) - T.gemm( - V_shared, - do, - dsT, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow, - wg_wait=-1) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) T.wait_wgmma(1) - T.copy(lse[bz, bx, k * block_N:(k + 1) * block_N], lse_shared) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) for i, j in T.Parallel(block_M, block_N): qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) if is_causal: for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], - 0) + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + # We don't need to handle OOB positions for non-causal cases, + # since OOB values won't affect other positions here. 
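The masking and running-max update introduced above follow the standard streaming (online) softmax recurrence. As a minimal NumPy sketch (toy inputs, nothing here is taken from the patch itself): keep a per-row running maximum that never decreases, rescale the partial normalizer and accumulator whenever that maximum grows, and out-of-range key columns masked to -inf then simply contribute exp(-inf) = 0.

import numpy as np

def online_attention_row(q, keys, values, block=4):
    # Streaming softmax-attention for a single query row, processed block by block.
    d = q.shape[-1]
    scale = 1.0 / np.sqrt(d)
    m = -np.inf            # running max of the scaled scores
    l = 0.0                # running sum of exp(score - m)
    acc = np.zeros(d)      # running weighted sum of value rows
    for start in range(0, keys.shape[0], block):
        s = (keys[start:start + block] @ q) * scale
        # (a padded / out-of-range column would be set to -inf here)
        m_new = max(m, s.max())            # the running max never decreases
        alpha = np.exp(m - m_new)          # rescales previously accumulated partials
        p = np.exp(s - m_new)
        l = l * alpha + p.sum()
        acc = acc * alpha + p @ values[start:start + block]
        m = m_new
    return acc / l

def dense_attention_row(q, keys, values):
    # Dense reference for comparison.
    s = (keys @ q) / np.sqrt(q.shape[-1])
    p = np.exp(s - s.max())
    return (p @ values) / p.sum()

rng = np.random.default_rng(0)
q = rng.standard_normal(8)
keys = rng.standard_normal((10, 8))
values = rng.standard_normal((10, 8))
assert np.allclose(online_attention_row(q, keys, values),
                   dense_attention_row(q, keys, values))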
T.wait_wgmma(0) T.copy(qkT, qkT_cast) T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(Delta[bz, bx, k * block_N:(k + 1) * block_N], delta) + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) for i, j in T.Parallel(block_M, block_N): dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale @@ -208,17 +200,16 @@ def flash_bwd( T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) T.wait_wgmma(0) T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N:(k + 1) * block_N, bx, :], dq_shared) + T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) T.copy(dv, dv_shared) T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M:(by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M:(by + 1) * block_M, bx, :]) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) return flash_bwd class _attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, causal): BATCH, N_CTX, H, D_HEAD = q.shape @@ -260,15 +251,15 @@ def maybe_contiguous(x): def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -283,9 +274,7 @@ def main( total_flops = 5 * flops_per_matmul if causal: total_flops *= 0.5 - Q = ( - torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, - device="cuda").normal_().requires_grad_()) + Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() K = torch.empty_like(Q).normal_().requires_grad_() V = torch.empty_like(Q).normal_().requires_grad_() dO = torch.randn_like(Q) @@ -305,7 +294,7 @@ def main( assert torch.allclose(dV, dV_ref, rtol=1e-2, atol=1e-2) assert torch.allclose(dK, dK_ref, rtol=1e-2, atol=1e-2) assert torch.allclose(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print('All checks passed.✅') + print("All checks passed.✅") def run(): O_ref.backward(dO, retain_graph=True) @@ -321,12 +310,44 @@ def run1(): print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(): + BATCH = 1 + H = 32 + N_CTX = 256 + D_HEAD = 64 + causal = False + device = "cuda" + torch.manual_seed(0) + block_M = 128 + block_N = 128 if D_HEAD <= 64 else 32 + Q = torch.randn(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.half) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + O = torch.randn_like(Q) + dO = torch.randn_like(Q) + lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) + with torch.no_grad(): + mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) + kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) + dQ = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float32) + dK = torch.zeros_like(Q, dtype=torch.float16) + dV = torch.zeros_like(Q, dtype=torch.float16) + Delta = mod_prep(O, dO) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(Q, K, V, dO, lse, 
Delta, dQ, dK, dV) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='Batch size') - parser.add_argument('--h', type=int, default=32, help='Number of heads') - parser.add_argument('--n_ctx', type=int, default=1024, help='Context size') - parser.add_argument('--d_head', type=int, default=64, help='Head dimension') - parser.add_argument('--causal', type=bool, default=False, help='Causal flag') + parser.add_argument("--batch", type=int, default=8, help="Batch size") + parser.add_argument("--h", type=int, default=32, help="Number of heads") + parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") + parser.add_argument("--d_head", type=int, default=64, help="Head dimension") + parser.add_argument("--causal", type=bool, default=False, help="Causal flag") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/example_mha_fwd_bhsd.py b/examples/flash_attention/example_mha_fwd_bhsd.py index f07f7a618..400736541 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd.py +++ b/examples/flash_attention/example_mha_fwd_bhsd.py @@ -15,107 +15,27 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: 
T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -131,43 +51,69 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, 
block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -185,18 +131,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() @@ -219,14 +155,28 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 64, + is_causal: bool = False, + tune: bool = False, +): + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=1, help='heads') - parser.add_argument('--seq_q', type=int, default=256, help='query sequence length') - parser.add_argument('--seq_kv', type=int, default=256, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=1, help="heads") + parser.add_argument("--seq_q", type=int, default=256, help="query sequence length") + parser.add_argument("--seq_kv", type=int, default=256, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal", default=False) + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git 
a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py index 26167b34b..90514f762 100644 --- a/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bhsd_wgmma_pipelined.py @@ -15,107 +15,27 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 past_len = seq_kv - seq_q assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" - @T.macro - def MMA0( - K: T.Tensor(kv_shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - q_idx = bx * block_M + i + past_len - k_idx = k * block_N + j - acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(kv_shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
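For reference, the 1.44269504 constant folded into `scale` here (and kept in the inlined code) is log2(e); computing exp2(x * scale - m * scale) is mathematically the same as exp((x - m) * sm_scale), it just maps onto exp2 plus a fused multiply-add on the GPU. A quick self-contained check with arbitrary values:

import math

dim = 64
sm_scale = (1.0 / dim) ** 0.5
scale = sm_scale * 1.44269504          # sm_scale * log2(e)

x, row_max = 3.7, 5.2
via_exp = math.exp((x - row_max) * sm_scale)
via_exp2 = 2.0 ** (x * scale - row_max * scale)
assert math.isclose(via_exp, via_exp2, rel_tol=1e-6)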
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -131,48 +51,75 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min( - T.ceildiv(seq_kv, block_N), T.ceildiv( - (bx + 1) * block_M + - past_len, block_N)) if is_causal else T.ceildiv(seq_kv, block_N)) + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_kv, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, 
by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bhqd,bhkd->bhqk', Q, K) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_q = Q.size(2) seq_kv = K.size(2) mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bhkd->bhqd', attention_weights, V) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) return output @@ -190,18 +137,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_q, - seq_kv, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() @@ -224,14 +161,28 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf( + batch: int = 1, + heads: int = 32, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 128, + is_causal: bool = False, + tune: bool = False, +): + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_q', type=int, default=4096, help='query sequence length') - parser.add_argument('--seq_kv', type=int, default=4096, help='key/value sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_q", type=int, default=4096, help="query sequence length") + parser.add_argument("--seq_kv", type=int, default=4096, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bshd.py b/examples/flash_attention/example_mha_fwd_bshd.py index 6a1f707e5..e584971c0 100644 --- a/examples/flash_attention/example_mha_fwd_bshd.py +++ b/examples/flash_attention/example_mha_fwd_bshd.py @@ -15,100 +15,23 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=1, 
- threads=128): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
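The Check_inf note kept (commented out) in these kernels concerns rows whose scores are all masked to -inf: the row max is then also -inf, and exp(-inf - (-inf)) evaluates to NaN, so the row max would need to be clamped to a finite value before exponentiating. A small illustrative sketch with a hypothetical fully-masked row:

import numpy as np

scores = np.full(4, -np.inf)               # every position masked out
naive = np.exp(scores - scores.max())      # exp(-inf - (-inf)) -> NaN
row_max = scores.max() if np.isfinite(scores.max()) else 0.0
checked = np.exp(scores - row_max)         # clean zeros instead

assert np.isnan(naive).all()
assert (checked == 0.0).all()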
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -124,40 +47,64 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if 
is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -174,17 +121,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=1, - threads=128) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=1, threads=128) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler() profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -206,13 +144,19 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf(batch: int = 8, heads: int = 32, seq_len: int = 4096, dim: int = 128, is_causal: bool = False): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=1, threads=128) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py index 3928db4c3..d6e1490c9 100644 --- a/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py +++ b/examples/flash_attention/example_mha_fwd_bshd_wgmma_pipelined.py @@ -15,100 +15,23 @@ def get_configs(): @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit( - out_idx=[3], pass_configs={ + out_idx=[3], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape = [batch, seq_len, heads, dim] - dtype = "float16" - accum_dtype = "float" - - @T.macro - def MMA0( - K: T.Tensor(shape, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], 
accum_dtype), - k: T.int32, - bx: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(K[bz, k * block_N:(k + 1) * block_N, by, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( - V: T.Tensor(shape, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - by: T.int32, - bz: T.int32, - ): - T.copy(V[bz, k * block_N:(k + 1) * block_N, by, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. 
- acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -124,45 +47,70 @@ def main( scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - T.copy(Q[bz, bx * block_M:(bx + 1) * block_M, by, :], Q_shared) + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = ( - T.min(T.ceildiv(seq_len, block_N), T.ceildiv( - (bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N)) + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) for k in T.Pipelined( - loop_range, - num_stages=num_stages, - order=[-1, 0, 3, 1, -1, 2], - stage=[-1, 0, 0, 1, -1, 1], - group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10], [11], [12], [13]]): - MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + loop_range, + num_stages=num_stages, + order=[-1, 0, 3, 1, -1, 2], + stage=[-1, 0, 0, 1, -1, 1], + group=[[0], [1, 2], [3, 4, 5, 6, 7, 8, 9, 10, 11], [12], [13], [14]], + ): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :]) + 
T.copy(O_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) return main def ref_program(Q, K, V, is_causal): dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) if is_causal: seq_len = Q.size(1) mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -179,17 +127,8 @@ def main( if is_causal: total_flops *= 0.5 - if (not tune): - kernel = flashattn( - batch, - heads, - seq_len, - dim, - is_causal, - block_M=128, - block_N=128, - num_stages=2, - threads=256) + if not tune: + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) ref_program_processed = partial(ref_program, is_causal=is_causal) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) @@ -211,13 +150,19 @@ def main( print(f"Ref latency: {ref_latency}") +def run_regression_perf(batch: int = 8, heads: int = 32, seq_len: int = 4096, dim: int = 128, is_causal: bool = False): + kernel = flashattn(batch, heads, seq_len, dim, is_causal, block_M=128, block_N=128, num_stages=2, threads=256) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--is_causal', action='store_true', help='causal') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", help="causal") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py index f381e900a..0f3610b11 100644 --- a/examples/flash_attention/example_mha_fwd_varlen.py +++ b/examples/flash_attention/example_mha_fwd_varlen.py @@ -4,109 +4,51 @@ import tilelang.language as T import tilelang.testing import argparse +from tilelang.profiler import do_bench +from tilelang.autotuner import set_autotune_inputs, autotune import torch -from einops import rearrange, repeat from varlen_utils import generate_random_padding_mask, generate_qkv +import itertools -def attention_ref( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - causal=False, - window_size=(-1, -1), # -1 means infinite window size - 
upcast=True, -): - """ - Arguments: - q: (batch_size, seqlen_q, nheads, head_dim) - k: (batch_size, seqlen_k, nheads_k, head_dim) - v: (batch_size, seqlen_k, nheads_k, head_dim) - query_padding_mask: (batch_size, seqlen_q) - key_padding_mask: (batch_size, seqlen_k) - attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) - dropout_p: float - dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) - causal: whether to apply causal masking - window_size: (int, int), left and right window size - upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast - output back to fp16/bf16. - reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) - without changing the math. This is to estimate the numerical error from operation - reordering. - Output: - output: (batch_size, seqlen_q, nheads, head_dim) - attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout - """ - if causal: - window_size = (window_size[0], 0) - dtype_og = q.dtype - if upcast: - q, k, v = q.float(), k.float(), v.float() - dim = q.shape[-1] - scale = (1.0 / dim)**0.5 # log2(e) - k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) - v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) - scores = torch.einsum("bthd,bshd->bhts", q, k) - if key_padding_mask is not None: - scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) - # scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0) - scores = scores * scale - attention = torch.softmax(scores, dim=-1).to(v.dtype) - - # We want to mask here so that the attention matrix doesn't have any NaNs - # Otherwise we'll get NaN in dV - if query_padding_mask is not None: - attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) - output = torch.einsum("bhts,bshd->bthd", attention, v) - if query_padding_mask is not None: - output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) - return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) +def get_configs(): + iter_params = dict(block_M=[64, 128], block_N=[64, 128], num_stages=[0, 1, 2, 3], threads=[128, 256]) + return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] +@autotune(configs=get_configs()) @tilelang.jit( - out_idx=[6], pass_configs={ + out_idx=[6], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) -def flashattn(batch_size, - UQ, - UKV, - heads, - dim, - is_causal, - block_M=64, - block_N=64, - num_stages=0, - threads=32): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + }, +) +def flashattn(batch_size, UQ, UKV, heads, dim, is_causal, block_M=64, block_N=64, num_stages=1, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [UQ, heads, dim] k_shape = [UKV, heads, dim] v_shape = [UKV, heads, dim] o_shape = [UQ, heads, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 @T.prim_func def main( - Q_unpad: T.Tensor(q_shape, dtype), - K_unpad: T.Tensor(k_shape, dtype), - V_unpad: T.Tensor(v_shape, dtype), - cu_seqlens_q: T.Tensor([batch_size + 1], "int32"), - cu_seqlens_k: T.Tensor([batch_size + 1], "int32"), - max_seqlen_q: T.int32, - Output_unpad: T.Tensor(o_shape, dtype), + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(k_shape, dtype), + V_unpad: T.Tensor(v_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + 
max_seqlen_q: T.int32, + Output_unpad: T.Tensor(o_shape, dtype), ): - with T.Kernel( - T.ceildiv(max_seqlen_q, block_M), heads, batch_size, - threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_M, dim], dtype, "shared") - K_shared = T.alloc_shared([block_N, dim], dtype, "shared") - V_shared = T.alloc_shared([block_N, dim], dtype, "shared") - O_shared = T.alloc_shared([block_M, dim], dtype, "shared") + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) acc_o = T.alloc_fragment([block_M, dim], accum_dtype) @@ -120,46 +62,46 @@ def main( head_idx = by q_start_idx = cu_seqlens_q[batch_idx] - k_start_idx = cu_seqlens_k[batch_idx] - v_start_idx = cu_seqlens_k[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] q_end_idx = cu_seqlens_q[batch_idx + 1] - k_end_idx = cu_seqlens_k[batch_idx + 1] - v_end_idx = cu_seqlens_k[batch_idx + 1] + kv_end_idx = cu_seqlens_k[batch_idx + 1] q_current_seqlen = q_end_idx - q_start_idx - k_current_seqlen = k_end_idx - k_start_idx - v_current_seqlen = v_end_idx - v_start_idx + kv_current_seqlen = kv_end_idx - kv_start_idx - for i, d in T.Parallel(block_M, dim): - if bx * block_M + i < q_current_seqlen: - Q_shared[i, d] = Q_unpad[q_start_idx + bx * block_M + i, head_idx, d] - else: - Q_shared[i, d] = 0 + T.copy( + Q_unpad[q_start_idx + bx * block_M : q_start_idx + bx * block_M + block_M, head_idx, :], Q_shared + ) # OOB positions will be handled below T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - loop_range = T.ceildiv(k_current_seqlen, block_N) + offset = kv_current_seqlen - q_current_seqlen # always align on the right + loop_range = ( + T.min(T.ceildiv(offset + (bx + 1) * block_M, block_N), T.ceildiv(kv_current_seqlen, block_N)) + if is_causal + else T.ceildiv(kv_current_seqlen, block_N) + ) for k in T.Pipelined(loop_range, num_stages=num_stages): # Q * K - for i, d in T.Parallel(block_N, dim): - if k * block_N + i < k_current_seqlen: - K_shared[i, d] = K_unpad[k_start_idx + k * block_N + i, head_idx, d] - else: - K_shared[i, d] = 0 + T.copy( + K_unpad[kv_start_idx + k * block_N : kv_start_idx + k * block_N + block_N, head_idx, :], K_shared + ) # OOB positions will be handled below if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= k * block_N + j) and - (bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), + -1e9, + 0, + ) else: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else((bx * block_M + i >= q_current_seqlen or - k * block_N + j >= k_current_seqlen), - -T.infinity(acc_s.dtype), 0) + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or k * block_N + j >= kv_current_seqlen), -1e9, 0 + ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) @@ -167,6 +109,8 @@ def main( T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, 
clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) # To do causal softmax, we need to set the scores_max to 0 if it is -inf # This process is called Check_inf in FlashAttention3 code, and it only need to be done # in the first ceil_div(kBlockM, kBlockN) steps. @@ -189,18 +133,17 @@ def main( acc_o[i, j] *= scores_scale[i] # V * softmax(Q * K) - for i, d in T.grid(block_N, dim): - if k * block_N + i < v_current_seqlen: - V_shared[i, d] = V_unpad[v_start_idx + k * block_N + i, head_idx, d] - else: - V_shared[i, d] = 0 + T.copy( + V_unpad[kv_start_idx + k * block_N : kv_start_idx + k * block_N + block_N, head_idx, :], V_shared + ) # OOB positions' weights are 0 T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_M, dim): - acc_o[i, j] /= logsum[i] - T.copy(acc_o, O_shared) + # When sq > skv, some tokens can see nothing + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + T.copy(acc_o, O_shared) for i, d in T.Parallel(block_M, dim): if bx * block_M + i < q_current_seqlen: Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = O_shared[i, d] @@ -208,19 +151,17 @@ def main( return main -def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): +def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128, causal: bool = False, tune: bool = False): flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim total_flops = 2 * flops_per_matmul tilelang.testing.set_random_seed(0) - causal = False if causal: total_flops *= 0.5 dtype = torch.float16 device = torch.device("cuda") - window_size = (-1, -1) q = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) k = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) @@ -240,30 +181,23 @@ def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): k, v, output_pad_fn, - dq_pad_fn, - dk_pad_fn, - ) = generate_qkv( - q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + _, + _, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) UQ = q_unpad.shape[0] # unpadded query length - UK = k_unpad.shape[0] # unpadded key length UKV = k_unpad.shape[0] # unpadded query key length - kernel = flashattn(batch, UQ, UKV, heads, dim, causal) + if tune: + with set_autotune_inputs(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q): + kernel = flashattn(batch, UQ, UKV, heads, dim, causal) + else: + kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=64, block_N=64, num_stages=1, threads=128) + # NOTE: (128, 128, 2or3, 256) is recommended for Hopper out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) out = output_pad_fn(out_unpad) - out_ref, _ = attention_ref( - q, - k, - v, - query_padding_mask, - key_padding_mask, - causal=causal, - ) - torch.testing.assert_close(out, out_ref, rtol=1e-2, atol=1e-2) - import flash_attn fla_out_unpad = flash_attn.flash_attn_varlen_func( @@ -282,13 +216,67 @@ def main(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128): print("All checks passed.✅") + # benchmark + t = do_bench(lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q)) + print(f"Tilelang time: {t} ms") + print(f"Tilelang: {total_flops / t * 1e-9} TFlops") + t = do_bench( + lambda: flash_attn.flash_attn_varlen_func( + q_unpad, k_unpad, v_unpad, cu_seqlens_q, 
cu_seqlens_k, max_seqlen_q, max_seqlen_k, 0.0, causal=causal + ) + ) + print(f"FA2 time: {t} ms") + print(f"FA2: {total_flops / t * 1e-9} TFlops") + + +def run_regression_perf(batch: int = 8, heads: int = 64, seq_len: int = 2048, dim: int = 128, causal: bool = False): + flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim + total_flops = 2 * flops_per_matmul + tilelang.testing.set_random_seed(0) + if causal: + total_flops *= 0.5 + dtype = torch.float16 + device = torch.device("cuda") + q = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) + k = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) + v = torch.randn(batch, seq_len, heads, dim, dtype=dtype, requires_grad=True).to(device) + query_padding_mask = generate_random_padding_mask(seq_len, batch, device, mode="random") + key_padding_mask = generate_random_padding_mask(seq_len, batch, device, mode="random") + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + UQ = q_unpad.shape[0] + UKV = k_unpad.shape[0] + kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=128, block_N=128, num_stages=2, threads=256) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q) + + return do_bench(run_kernel_only, backend="cupti") + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=64, help='heads') - parser.add_argument('--seq_len', type=int, default=2048, help='sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="heads") + parser.add_argument("--seq_len", type=int, default=2048, help="sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + parser.add_argument("--is_causal", action="store_true", default=False, help="causal attention") + parser.add_argument("--tune", action="store_true", default=False, help="tune the kernel") args = parser.parse_args() - main(args.batch, args.heads, args.seq_len, args.dim) + main(args.batch, args.heads, args.seq_len, args.dim, args.is_causal, args.tune) diff --git a/examples/flash_attention/regression_example_flash_attention.py b/examples/flash_attention/regression_example_flash_attention.py new file mode 100644 index 000000000..8710bbb6e --- /dev/null +++ b/examples/flash_attention/regression_example_flash_attention.py @@ -0,0 +1,74 @@ +import tilelang.testing +import example_gqa_fwd_bshd +import example_gqa_fwd_bshd_wgmma_pipelined +import example_mha_fwd_bhsd +import example_mha_fwd_bhsd_wgmma_pipelined +import example_mha_fwd_bshd +import example_mha_fwd_bshd_wgmma_pipelined +import example_mha_fwd_varlen +import example_gqa_bwd_tma_reduce_varlen +import example_gqa_bwd +import example_gqa_bwd_wgmma_pipelined +import example_mha_bwd_bshd +import example_mha_bwd_bhsd +import example_mha_bwd_bshd_wgmma_pipelined + + +def regression_example_gqa_bwd_tma_reduce_varlen(): + tilelang.testing.process_func(example_gqa_bwd_tma_reduce_varlen.run_regression_perf) + + +def regression_example_gqa_bwd(): + 
tilelang.testing.process_func(example_gqa_bwd.run_regression_perf) + + +def regression_example_gqa_bwd_wgmma_pipelined(): + tilelang.testing.process_func(example_gqa_bwd_wgmma_pipelined.run_regression_perf) + + +def regression_example_mha_bwd_bshd(): + tilelang.testing.process_func(example_mha_bwd_bshd.run_regression_perf) + + +def regression_example_mha_bwd_bhsd(): + tilelang.testing.process_func(example_mha_bwd_bhsd.run_regression_perf) + + +def regression_example_mha_bwd_bshd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_bwd_bshd_wgmma_pipelined.run_regression_perf) + + +def regression_example_gqa_fwd_bshd_wgmma_pipelined(): + tilelang.testing.process_func( + example_gqa_fwd_bshd_wgmma_pipelined.run_regression_perf, batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16 + ) + + +def regression_example_gqa_fwd_bshd(): + tilelang.testing.process_func( + example_gqa_fwd_bshd.run_regression_perf, batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16 + ) + + +def regression_example_mha_fwd_bhsd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_fwd_bhsd_wgmma_pipelined.run_regression_perf) + + +def regression_example_mha_fwd_bhsd(): + tilelang.testing.process_func(example_mha_fwd_bhsd.run_regression_perf) + + +def regression_example_mha_fwd_bshd_wgmma_pipelined(): + tilelang.testing.process_func(example_mha_fwd_bshd_wgmma_pipelined.run_regression_perf, batch=1, heads=32, seq_len=256) + + +def regression_example_mha_fwd_bshd(): + tilelang.testing.process_func(example_mha_fwd_bshd.run_regression_perf, batch=1, seq_len=256) + + +def regression_example_mha_fwd_varlen(): + tilelang.testing.process_func(example_mha_fwd_varlen.run_regression_perf, batch=4, heads=16, seq_len=512, dim=64) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/flash_attention/test_example_flash_attention.py b/examples/flash_attention/test_example_flash_attention.py index 8a58f3b6a..a74bf071b 100644 --- a/examples/flash_attention/test_example_flash_attention.py +++ b/examples/flash_attention/test_example_flash_attention.py @@ -2,7 +2,7 @@ import example_gqa_bwd import example_gqa_bwd_wgmma_pipelined -import example_mha_bwd +import example_mha_bwd_bshd import example_mha_bwd_bhsd import example_mha_fwd_bhsd_wgmma_pipelined import example_gqa_fwd_bshd @@ -10,8 +10,15 @@ import example_gqa_fwd_bshd_wgmma_pipelined import example_mha_fwd_bshd_wgmma_pipelined import example_mha_fwd_varlen -import example_mha_bwd_wgmma_pipelined +import example_mha_bwd_bshd_wgmma_pipelined import example_mha_fwd_bhsd +import example_gqa_bwd_tma_reduce_varlen +import example_gqa_fwd_varlen + + +@tilelang.testing.requires_cuda +def test_example_gqa_bwd_tma_reduce_varlen(): + example_gqa_bwd_tma_reduce_varlen.main() @tilelang.testing.requires_cuda @@ -27,31 +34,41 @@ def test_example_gqa_bwd_wgmma_pipelined(): @tilelang.testing.requires_cuda def test_example_mha_bwd(): - example_mha_bwd.main(BATCH=1) + example_mha_bwd_bshd.main( + BATCH=1, + H=16, + N_CTX=512, + D_HEAD=64, + causal=False, + ) @tilelang.testing.requires_cuda def test_example_mha_bwd_bhsd(): - example_mha_bwd_bhsd.main(BATCH=1) + example_mha_bwd_bhsd.main( + BATCH=1, + H=16, + N_CTX=512, + D_HEAD=64, + causal=False, + ) @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_mha_bwd_wgmma_pipelined(): - example_mha_bwd_wgmma_pipelined.main(BATCH=1) + example_mha_bwd_bshd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) 
@tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) def test_example_gqa_fwd_bshd_wgmma_pipelined(): - example_gqa_fwd_bshd_wgmma_pipelined.main( - batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) + example_gqa_fwd_bshd_wgmma_pipelined.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) @tilelang.testing.requires_cuda def test_example_gqa_fwd_bshd(): - example_gqa_fwd_bshd.main( - batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) + example_gqa_fwd_bshd.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) @tilelang.testing.requires_cuda @@ -78,7 +95,14 @@ def test_example_mha_fwd_bshd(): @tilelang.testing.requires_cuda def test_example_mha_fwd_varlen(): - example_mha_fwd_varlen.main() + example_mha_fwd_varlen.main(batch=4, heads=16, seq_len=512, dim=64, causal=False) + example_mha_fwd_varlen.main(batch=4, heads=16, seq_len=512, dim=64, causal=True) + + +@tilelang.testing.requires_cuda +def test_example_gqa_fwd_varlen(): + example_gqa_fwd_varlen.main(batch=4, heads=16, q_seqlen=512, k_seqlen=512, dim=64, is_causal=False) + example_gqa_fwd_varlen.main(batch=4, heads=16, q_seqlen=512, k_seqlen=512, dim=64, is_causal=True) if __name__ == "__main__": diff --git a/examples/flash_attention/varlen_utils.py b/examples/flash_attention/varlen_utils.py index 4301215d5..43e21cc3b 100644 --- a/examples/flash_attention/varlen_utils.py +++ b/examples/flash_attention/varlen_utils.py @@ -9,22 +9,14 @@ def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): if mode == "full": lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) elif mode == "random": - lengths = torch.randint( - max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) + lengths = torch.randint(max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device) elif mode == "third": lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) - padding_mask = ( - repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths) + padding_mask = repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths return padding_mask -def generate_qkv(q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - kvpacked=False, - qkvpacked=False): +def generate_qkv(q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False): """ Arguments: q: (batch_size, seqlen_q, nheads, d) @@ -39,15 +31,12 @@ def generate_qkv(q, if query_padding_mask is not None: q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) - output_pad_fn = lambda output_unpad: pad_input(output_unpad, indices_q, batch_size, seqlen_q - ) + output_pad_fn = lambda output_unpad: pad_input(output_unpad, indices_q, batch_size, seqlen_q) else: q_unpad = rearrange(q, "b s h d -> (b s) h d") - cu_seqlens_q = torch.arange( - 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device) + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device) max_seqlen_q = seqlen_q - output_pad_fn = lambda output_unpad: rearrange( - output_unpad, "(b s) h d -> b s h d", b=batch_size) + output_pad_fn = lambda output_unpad: rearrange(output_unpad, "(b s) h d -> b s h d", b=batch_size) if key_padding_mask is not None: k_unpad, indices_k, cu_seqlens_k, 
max_seqlen_k = unpad_input(k, key_padding_mask) @@ -55,8 +44,7 @@ def generate_qkv(q, else: k_unpad = rearrange(k, "b s h d -> (b s) h d") v_unpad = rearrange(v, "b s h d -> (b s) h d") - cu_seqlens_k = torch.arange( - 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device) + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device) max_seqlen_k = seqlen_k if qkvpacked: @@ -67,8 +55,7 @@ def generate_qkv(q, if query_padding_mask is not None: dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) else: - dqkv_pad_fn = lambda dqkv_unpad: rearrange( - dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + dqkv_pad_fn = lambda dqkv_unpad: rearrange(dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) return ( qkv_unpad.detach().requires_grad_(), cu_seqlens_q, @@ -84,8 +71,7 @@ def generate_qkv(q, if key_padding_mask is not None: dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) else: - dkv_pad_fn = lambda dkv_unpad: rearrange( - dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) + dkv_pad_fn = lambda dkv_unpad: rearrange(dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size) return ( q_unpad.detach().requires_grad_(), kv_unpad.detach().requires_grad_(), diff --git a/examples/flash_decoding/example_gqa_decode.py b/examples/flash_decoding/example_gqa_decode.py index 5f946d8b5..9e6f36017 100644 --- a/examples/flash_decoding/example_gqa_decode.py +++ b/examples/flash_decoding/example_gqa_decode.py @@ -15,18 +15,12 @@ def get_configs(): block_N = [64, 128] block_H = [64] - num_split = [2, 4, 8] + num_split = [1, 2, 4, 8] num_stages = [1, 2, 3] threads = [128] _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) - configs = [{ - 'block_N': c[0], - 'block_H': c[1], - 'num_split': c[2], - 'num_stages': c[3], - 'threads': c[4] - } for c in _configs] + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] return configs @@ -40,45 +34,44 @@ def get_heuristic_config() -> Tuple[Dict, int]: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version == 89: - cfg = dict(block_N=128, block_H=64, num_split=16, num_stages=0, threads=128) + cfg = dict(block_N=128, block_H=64, num_split=1, num_stages=0, threads=128) else: - cfg = dict(block_N=128, block_H=64, num_split=16, num_stages=2, threads=128) + cfg = dict(block_N=128, block_H=64, num_split=8, num_stages=2, threads=128) return cfg, sm_version # TODO(lei): fix warp specialized and tma lower pass def get_pass_configs(): - return { - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - } + return {tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[6], pass_configs=get_pass_configs()) -def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, - threads): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) +def flashattn(batch, heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, threads): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, heads, dim] shape_k = [batch, seqlen_kv, groups, dim] shape_v = [batch, seqlen_kv, groups, dim] shape_o = [batch, heads, dim] - dtype = "float16" - accum_dtype = "float" + 
dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // groups part_shape = [batch, heads, num_split, dim] valid_block_H = min(block_H, kv_group_num) valid_block_N = min(block_N, seqlen_kv // num_split) - @T.macro - def flash_attn( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def flashattn_gqa_decode_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): + # split with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) @@ -96,25 +89,43 @@ def flash_attn( bid = bx hid = by + sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy(K[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], K_shared) - T.copy(mask[bid, k * block_N:(k + 1) * block_N, cur_kv_head], mask_local) + T.copy( + K[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + :, + ], + K_shared, + ) + T.copy( + mask[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + ], + mask_local, + ) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(mask_local[j] != 0, acc_s[i, j], - -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else((mask_local[j] != 0) & (j < seqlen_kv // num_split), acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -125,23 +136,66 @@ def flash_attn( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, k * block_N:(k + 1) * block_N, cur_kv_head, :], V_shared) + T.copy( + V[ + bid, + (seqlen_kv // num_split) * sid + k * valid_block_N : (seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, + cur_kv_head, + :, + ], + V_shared, + ) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output[bid, hid * valid_block_H:(hid + 1) * valid_block_H, :]) - - @T.macro - def 
flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), + T.copy(O_shared, Output_partial[bid, hid * valid_block_H : (hid + 1) * valid_block_H, sid, :]) + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim], dtype) + o_accum_local = T.alloc_fragment([dim], accum_dtype) + lse_local = T.alloc_fragment([num_split, 128], dtype) + lse_logsum_local = T.alloc_fragment([128], accum_dtype) + lse_max_local = T.alloc_fragment([128], accum_dtype) + scale_local = T.alloc_fragment([128], accum_dtype) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + for k, j in T.Parallel(num_split, 128): + lse_local[k, j] = glse[bz, by, k] + T.reduce_max(lse_local, lse_max_local, dim=0, clear=True) + for k in T.serial(num_split): + for j in T.Parallel(128): + lse_logsum_local[j] += T.exp2(lse_local[k, j] - lse_max_local[j]) + for j in T.Parallel(128): + lse_logsum_local[j] = T.log2(lse_logsum_local[j]) + lse_max_local[j] + for k in T.serial(num_split): + for i in T.Parallel(dim): + po_local[i] = Output_partial[bz, by, k, i] + for j in T.Parallel(128): + scale_local[j] = T.exp2(lse_local[k, j] - lse_logsum_local[j]) + # Note: Pay attention to dim and the number of threads in Parallel + for i in T.Parallel(dim): + o_accum_local[i] += po_local[i] * scale_local[i] + for i in T.Parallel(dim): + Output[bz, by, i] = o_accum_local[i] + + @T.prim_func + def flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor(part_shape, dtype), + Output: T.Tensor(shape_o, dtype), ): with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -160,34 +214,26 @@ def flash_attn_split( bid = bx hid = by - sid = bz cur_kv_head = hid // (kv_group_num // valid_block_H) - T.copy(Q[bid, hid * valid_block_H:hid * valid_block_H + block_H, :], Q_shared) + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) loop_range = T.ceildiv((seqlen_kv // num_split), block_N) - for k in T.Pipelined(loop_range, num_stages=num_stages): - T.copy( - K[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head, :], K_shared) - T.copy( - mask[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head], mask_local) + T.copy(K[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], K_shared) + T.copy(mask[bid, k * block_N : (k + 1) * block_N, cur_kv_head], mask_local) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - acc_s[i, - j] = T.if_then_else((mask_local[j] != 0) & (j < seqlen_kv // num_split), - acc_s[i, j], -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else(mask_local[j] != 0, acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_H): + 
scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): @@ -198,88 +244,14 @@ def flash_attn_split( T.copy(acc_s, acc_s_cast) for i, j in T.Parallel(block_H, dim): acc_o[i, j] *= scores_scale[i] - T.copy( - V[bid, (seqlen_kv // num_split) * sid + - k * valid_block_N:(seqlen_kv // num_split) * sid + (k + 1) * valid_block_N, - cur_kv_head, :], V_shared) + T.copy(V[bid, k * block_N : (k + 1) * block_N, cur_kv_head, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] T.copy(acc_o[:valid_block_H, :], O_shared) - T.copy(O_shared, Output_partial[bid, hid * valid_block_H:(hid + 1) * valid_block_H, - sid, :]) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), - ): - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim], dtype) - o_accum_local = T.alloc_fragment([dim], accum_dtype) - lse_local = T.alloc_fragment([num_split, 128], dtype) - lse_local_split = T.alloc_local([1], accum_dtype) - lse_logsum_local = T.alloc_local([1], accum_dtype) - lse_max_local = T.alloc_fragment([128], accum_dtype) - scale_local = T.alloc_local([1], accum_dtype) - - T.annotate_layout({ - lse_logsum_local: T.Fragment(lse_logsum_local.shape, forward_thread_fn=lambda i: i), - lse_max_local: T.Fragment(lse_max_local.shape, forward_thread_fn=lambda i: i), - # lse_local: (local_id, thread_id) - lse_local: T.Fragment(lse_local.shape, forward_fn=lambda i, j: (j, i)), - }) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - for k, j in T.Parallel(num_split, 128): - lse_local[k, j] = glse[bz, by, k] - T.reduce_max(lse_local, lse_max_local, dim=0, clear=True) - for k in T.Pipelined(num_split, num_stages=1): - lse_local_split[0] = glse[bz, by, k] - lse_logsum_local[0] += T.exp2(lse_local_split[0] - lse_max_local[0]) - lse_logsum_local[0] = T.log2(lse_logsum_local[0]) + lse_max_local[0] - for k in T.serial(num_split): - for i in T.Parallel(dim): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split[0] = glse[bz, by, k] - scale_local[0] = T.exp2(lse_local_split[0] - lse_logsum_local[0]) - for i in T.Parallel(dim): - o_accum_local[i] += po_local[i] * scale_local[0] - for i in T.Parallel(dim): - Output[bz, by, i] = o_accum_local[i] - - @T.prim_func - def flashattn_gqa_decode_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn_split(Q, K, V, mask, glse, Output_partial) - combine(glse, Output_partial, Output) - - @T.prim_func - def flashattn_gqa_decode_no_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_o, dtype), - ): - flash_attn(Q, K, V, mask, Output) + 
T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) if num_split > 1: return flashattn_gqa_decode_split @@ -300,27 +272,21 @@ def ref_program(query, key, value, mask, glse, Output_partial): dim = query.shape[-1] num_head_groups = query.shape[1] // key.shape[2] scale = dim**0.5 - key = rearrange(key, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] - value = rearrange(value, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + key = rearrange(key, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] + value = rearrange(value, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - query = rearrange( - query, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + query = rearrange(query, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] if mask is not None: - mask = rearrange(mask, 'b s h -> b h s') + mask = rearrange(mask, "b s h -> b h s") mask = mask.unsqueeze(1) - scores = scores.masked_fill(mask == 0, float('-inf')) + scores = scores.masked_fill(mask == 0, float("-inf")) - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, value, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, value, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -334,16 +300,12 @@ def flash_split_ref(Q, K, V, mask): seqlen_kv = K.size(1) num_head_groups = nheads // groups - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, num_head_groups, groups, block_N), device="cuda", dtype=torch.float) - acc_s_cast = torch.empty((batch, num_head_groups, groups, block_N), - device="cuda", - dtype=torch.float16) + acc_s_cast = torch.empty((batch, num_head_groups, groups, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, num_head_groups, groups, dim), device="cuda", dtype=torch.float) scores_max = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) - scores_max_prev = torch.empty((batch, num_head_groups, groups), - device="cuda", - dtype=torch.float) + scores_max_prev = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) scores_scale = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) scores_sum = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) logsum = torch.empty((batch, num_head_groups, groups), device="cuda", dtype=torch.float) @@ -351,25 +313,25 @@ def flash_split_ref(Q, K, V, mask): glogsum = torch.empty((num_split, batch, nheads), device="cuda", dtype=torch.float) Q_ = Q * scale - Q_ = rearrange(Q_, 'b (h g) d -> b g h d', g=num_head_groups) + Q_ = rearrange(Q_, "b (h g) d -> b g h d", g=num_head_groups) for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) 
+ scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bghd,bkhd->bghk', Q_, - K[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, nheads, block_N] + acc_s = torch.einsum( + "bghd,bkhd->bghk", + Q_, + K[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, nheads, block_N] if mask is not None: - mask_local = mask[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + (i + 1) * block_N, :] - mask_local = rearrange(mask_local, 'b s h -> b h s') + mask_local = mask[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :] + mask_local = rearrange(mask_local, "b s h -> b h s") mask_local = mask_local.unsqueeze(1) - acc_s = acc_s.masked_fill(mask_local == 0, float('-inf')) + acc_s = acc_s.masked_fill(mask_local == 0, float("-inf")) scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [batch, nheads] scores_scale = torch.exp2(scores_max_prev - scores_max) # [batch, nheads] @@ -377,15 +339,16 @@ def flash_split_ref(Q, K, V, mask): acc_s = torch.exp2(acc_s - scores_max[:, :, :, None]) acc_s_cast = acc_s.to(torch.float16) # [batch, nheads, block_N] acc_o += torch.einsum( - 'bghk,bkhd->bghd', acc_s_cast, - V[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bghk,bkhd->bghd", + acc_s_cast, + V[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum - acc_o_out = rearrange(acc_o, 'b g h d->b (h g) d') - logsum_out = rearrange(logsum, 'b g h->b (h g)') + acc_o_out = rearrange(acc_o, "b g h d->b (h g) d") + logsum_out = rearrange(logsum, "b g h->b (h g)") acc_o_out /= logsum_out[:, :, None] - logsum_out = torch.log2(logsum_out) + rearrange(scores_max, 'b g h->b (h g)') + logsum_out = torch.log2(logsum_out) + rearrange(scores_max, "b g h->b (h g)") gacc_o[ks, :, :, :] = acc_o_out glogsum[ks, :, :] = logsum_out @@ -421,7 +384,7 @@ def calc_sim(x, y, name="tensor"): x, y = x.data.double(), y.data.double() denominator = (x * x + y * y).sum() if denominator == 0: - print_red_warning(f'{name} all zero') + print_red_warning(f"{name} all zero") return 1 sim = 2 * (x * y).sum() / denominator return sim @@ -429,28 +392,23 @@ def calc_sim(x, y, name="tensor"): def assert_similar(x, y, eps=1e-2, name="tensor", assert_=False, print_=True): sim = calc_sim(x, y, name) - diff = 1. 
- sim + diff = 1.0 - sim if not (0 <= diff <= eps): - print_red_warning(f'{name} Error: {diff}') + print_red_warning(f"{name} Error: {diff}") if assert_: - raise AssertionError(f'{name} Error: {diff}') + raise AssertionError(f"{name} Error: {diff}") else: if print_: - print(f'passed: {name} diff={diff}') + print(f"passed: {name} diff={diff}") -def main(batch: int = 1, - heads: int = 32, - groups: int = 8, - kv_seqlen: int = 8192, - dim: int = 128, - tune: bool = False): +def main(batch: int = 1, heads: int = 32, groups: int = 8, kv_seqlen: int = 8192, dim: int = 128, tune: bool = False): batch, heads, groups, kv_seqlen, dim = batch, heads, groups, kv_seqlen, dim qk_flops = 2 * batch * heads * kv_seqlen * dim pv_flops = 2 * batch * heads * kv_seqlen * dim total_flops = qk_flops + pv_flops - if (not tune): + if not tune: config, sm_version = get_heuristic_config() kernel = flashattn(batch, heads, groups, kv_seqlen, dim, **config) profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) @@ -459,8 +417,9 @@ def main(batch: int = 1, k = torch.randn(batch, kv_seqlen, groups, dim, device="cuda", dtype=torch.float16) v = torch.randn(batch, kv_seqlen, groups, dim, device="cuda", dtype=torch.float16) mask = torch.randint(0, 2, (batch, kv_seqlen, groups), device="cuda", dtype=torch.uint8) - glse = torch.empty(batch, heads, 16, device="cuda", dtype=torch.float16) - Output_partial = torch.empty(batch, heads, 16, dim, device="cuda", dtype=torch.float16) + split = config["num_split"] + glse = torch.empty(batch, heads, split, device="cuda", dtype=torch.float16) + Output_partial = torch.empty(batch, heads, split, dim, device="cuda", dtype=torch.float16) o = kernel(q, k, v, mask, glse, Output_partial) o_ref = ref_program(q, k, v, mask, glse, Output_partial) o_ref_split = ref_split_program(q, k, v, mask, glse, Output_partial) @@ -469,7 +428,7 @@ def main(batch: int = 1, print(o_ref) assert_similar(o, o_ref, name="o_ref") - assert_similar(o_ref_split, o_ref, name="o_ref_split") + assert_similar(o, o_ref_split, name="o_ref_split") print("All checks pass.") latency = profiler.do_bench(ref_program, warmup=500) @@ -489,13 +448,21 @@ def main(batch: int = 1, print(f"Ref latency: {ref_latency}") +def run_regression_perf(batch: int = 1, heads: int = 32, groups: int = 8, kv_seqlen: int = 8192, dim: int = 128): + batch, heads, groups, kv_seqlen, dim = batch, heads, groups, kv_seqlen, dim + config, _ = get_heuristic_config() + kernel = flashattn(batch, heads, groups, kv_seqlen, dim, **config) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=32, help='heads') - parser.add_argument('--groups', type=int, default=8, help='groups') - parser.add_argument('--kv_seqlen', type=int, default=8192, help='kv sequence length') - parser.add_argument('--dim', type=int, default=128, help='dim') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=1, help="batch size") + parser.add_argument("--heads", type=int, default=32, help="heads") + parser.add_argument("--groups", type=int, default=8, help="groups") + parser.add_argument("--kv_seqlen", type=int, default=8192, help="kv sequence length") + parser.add_argument("--dim", type=int, default=128, help="dim") + 
parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() main(args.batch, args.heads, args.groups, args.kv_seqlen, args.dim, args.tune) diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits.py b/examples/flash_decoding/example_gqa_decode_varlen_logits.py new file mode 100644 index 000000000..30acd879e --- /dev/null +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits.py @@ -0,0 +1,785 @@ +import torch +import triton +import triton.language as tl +import math +import argparse +import tilelang +import tilelang.language as T + +torch.manual_seed(0) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +@triton.jit +def _fwd_inner( + q, + k_ptrs, + v_ptrs, + s_ptrs, + m_i, + l_i, + acc, + offs_h, + mask_h, + offs_n, + seqlen, + softmax_scale, + lo, + hi, + stride_kt, + stride_vt, + stride_sh, + stride_sn, + BLOCK_N: tl.constexpr, +): + """Inner loop computation for attention""" + + for blk_idx in tl.range(lo, hi): + start_n = blk_idx * BLOCK_N + k = tl.load(k_ptrs + start_n * stride_kt, mask=offs_n[None, :] + start_n < seqlen) + v = tl.load(v_ptrs + start_n * stride_vt, mask=offs_n[:, None] + start_n < seqlen) + + qk = tl.dot(q, k) + qk *= softmax_scale + qk += tl.where(offs_n[None, :] + start_n < seqlen, 0, -1.0e9) + + row_max = tl.max(qk, 1) + tl.store(s_ptrs + offs_h * stride_sh + blk_idx * stride_sn, row_max, mask=mask_h) + + m_ij = tl.maximum(m_i, row_max) + qk -= m_ij[:, None] + p = tl.math.exp(qk) + l_ij = tl.sum(p, 1) + alpha = tl.math.exp(m_i - m_ij) + l_i = l_i * alpha + l_ij + m_i = m_ij + acc *= alpha[:, None] + p = p.to(v.type.element_ty) + acc += tl.dot(p, v) + + return m_i, l_i, acc + + +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [4, 8] for num_stages in [2, 4]], + key=["gqa_group_size", "BLOCK_N", "BLOCK_D", "BLOCK_H"], +) +@triton.jit +def _fwd_kernel_varlen( + Q, # [token_q = b, h_q, dim] + K, # [token_k, h_kv, dim] + V, + O, + S, + s_aux, + softmax_scale, + cu_seqlens_k, + stride_qt, + stride_qh, + stride_qd, + stride_kt, + stride_kh, + stride_kd, + stride_vt, + stride_vh, + stride_vd, + stride_ot, + stride_oh, + stride_od, + stride_sb, + stride_sh, + stride_sn, # bmask shape [b, q_h, seq/BLOCK_N] + gqa_group_size: tl.constexpr, + BLOCK_H: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_D: tl.constexpr, +): + off_z = tl.program_id(0) + off_h_for_kv = tl.program_id(1) + off_h_q = off_h_for_kv * gqa_group_size + + cu_k_start = tl.load(cu_seqlens_k + off_z) + cu_k_end = tl.load(cu_seqlens_k + off_z + 1) + + seqlen_k = cu_k_end - cu_k_start + + offs_h = tl.arange(0, BLOCK_H) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_D) + + Q_ptrs = Q + off_z * stride_qt + off_h_q * stride_qh + K_ptrs = K + (cu_k_start) * stride_kt + off_h_for_kv * stride_kh + V_ptrs = V + (cu_k_start) * stride_vt + off_h_for_kv * stride_vh + O_ptrs = O + off_z * stride_ot + off_h_q * stride_oh + S_ptrs = S + off_z * stride_sb + 
off_h_q * stride_sh + + mask_h = offs_h < gqa_group_size + q = tl.load(Q_ptrs + offs_d[None, :] * stride_qd + offs_h[:, None] * stride_qh, mask=mask_h[:, None]) + + if s_aux is not None: + sink = tl.load(s_aux + off_h_q + offs_h, mask=mask_h).to(tl.float32) + l_i = tl.zeros([BLOCK_H], dtype=tl.float32) + m_i = tl.zeros([BLOCK_H], dtype=tl.float32) + sink + else: + l_i = tl.full([BLOCK_H], 1.0, dtype=tl.float32) + m_i = tl.full([BLOCK_H], float("-inf"), dtype=tl.float32) + + acc = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32) + + k_ptrs = K_ptrs + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd + v_ptrs = V_ptrs + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd + + lo, hi = 0, tl.cdiv(seqlen_k, BLOCK_N) + m_i, l_i, acc = _fwd_inner( + q, + k_ptrs, + v_ptrs, + S_ptrs, + m_i, + l_i, + acc, + offs_h, + mask_h, + offs_n, + seqlen_k, + softmax_scale, + lo, + hi, + stride_kt, + stride_vt, + stride_sh, + stride_sn, + BLOCK_N, + ) + + if s_aux is not None: + sink = tl.math.exp(sink - m_i) + l_i = l_i + sink + acc = acc / l_i[:, None] + + else: + l_recip = 1 / l_i[:, None] + acc = acc * l_recip + + for blk_idx in tl.range(lo, hi): + s = tl.load(S_ptrs + offs_h * stride_sh + blk_idx * stride_sn, mask=mask_h) + s = tl.exp(s - m_i) / l_i + tl.store(S_ptrs + offs_h * stride_sh + blk_idx * stride_sn, s, mask=mask_h) + + acc = acc.to(O.dtype.element_ty) + + tl.store(O_ptrs + offs_h[:, None] * stride_oh + offs_d[None, :] * stride_od, acc, mask=mask_h[:, None]) + + +def get_configs(): + import itertools + + block_N = [64, 128] + block_H = [64] + num_split = [1] + num_stages = [1, 2, 3] + threads = [128] + _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) + + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] + return configs + + +@tilelang.jit(out_idx=[-2, -1]) +def flashattn( + batch, heads, k_heads, max_seqlen_kv, total_seqlen_k, dim, has_sink, block_N=128, block_H=64, num_split=1, num_stages=1, threads=128 +): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + shape_q = [batch, heads, dim] + shape_k = [total_seqlen_k, k_heads, dim] + shape_v = [total_seqlen_k, k_heads, dim] + shape_o = [batch, heads, dim] + shape_s = [batch, heads, math.ceil(max_seqlen_kv / block_N)] + dtype = T.float16 + accum_dtype = T.float32 + kv_group_num = heads // k_heads + + valid_block_H = min(block_H, kv_group_num) + # TODO: check if max_seqlen_kv is correct for varlen case + + @T.prim_func + def flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + Output: T.Tensor(shape_o, dtype), + S: T.Tensor(shape_s, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([valid_block_H, dim], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim], accum_dtype) + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = 
T.alloc_fragment([block_H], accum_dtype) + S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) + # S_fragment = T.alloc_fragment([block_H, math.ceil(max_seqlen_kv / block_N)], accum_dtype) + s_aux_shared = T.alloc_shared([block_H], T.float32) + + bid = bx + hid = by + cur_kv_head = hid // (kv_group_num // valid_block_H) + + cur_start_k = cu_seqlens_k[bid] + cur_end_k = cu_seqlens_k[bid + 1] + cur_seqlen_k = cur_end_k - cur_start_k + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) + for k in T.Pipelined(loop_range, num_stages=num_stages): + T.copy(K[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_H, block_N): + # acc_s[i, j] = T.if_then_else(mask_local[j] != 0 and k * block_N + j < cur_seqlen_k, acc_s[i, j], + # -T.infinity(accum_dtype)) + acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], -T.infinity(accum_dtype)) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # scores_max_prev is m_i + # scores_max is row_max->m_ij in triton + T.copy(scores_max, S_shared[:, k]) + # scores_scale is alpha in triton + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + # scores_sum is l_ij in triton + # logsum is l_i in triton + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] *= scores_scale[i] + T.copy(V[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + if has_sink: + T.copy(s_aux[hid * valid_block_H : hid * valid_block_H + block_H], s_aux_shared) + for i in T.Parallel(block_H): + logsum[i] += s_aux_shared[i] + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] /= logsum[i] + for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): + S_shared[h, k] = T.exp2((S_shared[h, k] - scores_max[h]) * scale) / logsum[h] + # T.copy(S_shared, S_fragment) + # for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): + # S_fragment[h, k] = T.exp2((S_fragment[h, k] - scores_max[h]) * scale) / logsum[h] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(acc_o[:valid_block_H, :], O_shared) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + # T.copy(S_fragment, S_shared) + T.copy(S_shared[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + + # TODO: split version + return flashattn_gqa_decode_no_split + + +def flash_attn_with_attn_pool_decode_tilelang( + Q: torch.Tensor, ## [tq = b, q_h, q_dim] + K: torch.Tensor, ## [tk, k_h, k_dim] + V: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_k: int, + real_max_k_seqlen: int, + num_split: int, + softmax_scale: float, + s_aux: torch.Tensor = None, 
+ block_size: int = 64, + use_per_kv_head_sparse_index: bool = False, + tl_kernel=None, +): + num_tokens, q_h, head_size = Q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = K.size(1) + + assert Q.dim() == K.dim() == 3 + assert Q.size(2) == K.size(2) + assert cu_seqlens_k.dim() == 1 + assert head_size in {64, 128, 256} + assert Q.is_contiguous() + # assert K.is_contiguous() + # assert V.is_contiguous() + + gqa_group_size = q_h // k_h + + O_tl = torch.zeros_like(Q) + S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), dtype=Q.dtype, device=Q.device) + O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux) + + if use_per_kv_head_sparse_index: + S_tl = torch.max_pool2d(S_tl, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) + else: + S_tl = torch.max_pool2d(S_tl, kernel_size=(q_h, 1), stride=(q_h, 1)) + + return O_tl, S_tl + + +def flash_attn_with_attn_pool_decode( + Q: torch.Tensor, ## [tq = b, q_h, q_dim] + K: torch.Tensor, ## [tk, k_h, k_dim] + V: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_k: int, + real_max_k_seqlen: int, + num_split: int, + softmax_scale: float, + s_aux: torch.Tensor = None, + block_size: int = 64, + use_per_kv_head_sparse_index: bool = False, +): + num_tokens, q_h, head_size = Q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = K.size(1) + + assert Q.dim() == K.dim() == 3 + assert Q.size(2) == K.size(2) + assert cu_seqlens_k.dim() == 1 + assert head_size in {64, 128, 256} + assert Q.is_contiguous() + # assert K.is_contiguous() + # assert V.is_contiguous() + + gqa_group_size = q_h // k_h + + BLOCK_D = head_size + BLOCK_N = block_size + BLOCK_H = 64 + + O = torch.zeros_like(Q) + S = torch.zeros((batch, q_h, math.ceil(max_seqlen_k / block_size)), dtype=Q.dtype, device=Q.device) + + def grid(META): + return (batch, k_h) + + with torch.cuda.device(Q.device.index): + _fwd_kernel_varlen[grid]( + Q, + K, + V, + O, + S, + s_aux, + softmax_scale, + cu_seqlens_k, + *Q.stride(), + *K.stride(), + *V.stride(), + *O.stride(), + *S.stride(), + gqa_group_size, + BLOCK_H=BLOCK_H, + BLOCK_N=BLOCK_N, + BLOCK_D=BLOCK_D, + ) + + if use_per_kv_head_sparse_index: + S = torch.max_pool2d(S, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) + else: + S = torch.max_pool2d(S, kernel_size=(q_h, 1), stride=(q_h, 1)) + + return O, S + + +def test_varlen_decode_main(args): + """Test decode kernel with variable sequence lengths""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen # Use as max sequence length + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(f"Using sink attention with sink values: {sink}") + + # Generate variable length k sequences + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + print(f"k_seqlens: {k_seqlens}") + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + print(f"cu_seqlens_k: {cu_seqlens_k}") + + # 
Generate tensors - Q is [batch_size, q_heads, head_size] for decode + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + print(f"Actual max_seqlen_k: {max_seqlen_k}") + print(f"q_decode shape: {q_decode.shape}") + print(f"k_varlen shape: {k_varlen.shape}") + print(f"v_varlen shape: {v_varlen.shape}") + + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) + + # Test our decode kernel + O_triton, S_triton = flash_attn_with_attn_pool_decode( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + ) + O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + tl_kernel=tl_kernel, + ) + for i in range(batch_size): + S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 + + # Create torch reference - pad tensors for comparison + k_padded_list = [] + v_padded_list = [] + + for i in range(batch_size): + actual_k_len = k_seqlens[i] + + # Extract and pad k, v for this batch + k_start = cu_seqlens_k[i] + k_end = cu_seqlens_k[i + 1] + + # Pad to max_seqlen_k + k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) + v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) + + k_padded[:actual_k_len] = k_varlen[k_start:k_end] + v_padded[:actual_k_len] = v_varlen[k_start:k_end] + + k_padded_list.append(k_padded) + v_padded_list.append(v_padded) + + # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] + k_padded_batched = torch.stack(k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + v_padded_batched = torch.stack(v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + + # Expand q to match kv heads: [b, q_heads, 1, head_size] + q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] + + print(f"q_expanded shape: {q_expanded.shape}") + print(f"k_padded_batched shape: {k_padded_batched.shape}") + print(f"v_padded_batched shape: {v_padded_batched.shape}") + + # Compute torch reference + k_repeat = repeat_kv(k_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + v_repeat = repeat_kv(v_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + + if sink is None: + # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] + attn_score = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_score[i, :, :, actual_k_len:] = float("-inf") + + attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, 
q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] + else: + # s_aux attention + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + logits[i, :, :, actual_k_len:] = float("-inf") + + sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) + sinks = torch.exp(sink_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks + attn_weights = unnormalized_scores / normalizer + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat) # [b, q_heads, 1, head_size] + + O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] + + # Compute attention score pooling for S + attn_score_pooled = torch.max_pool2d( + attn_weights.squeeze(2), # [b, q_heads, max_seqlen] + kernel_size=(q_heads, block_size), + stride=(q_heads, block_size), + ceil_mode=True, + ).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] + + print(f"O_triton shape: {O_triton.shape}") + print(f"O_tilelang shape: {O_tilelang.shape}") + print(f"O_torch shape: {O_torch.shape}") + print(f"S_triton shape: {S_triton.shape}") + print(f"S_tilelang shape: {S_tilelang.shape}") + print(f"attn_score_pooled shape: {attn_score_pooled.shape}") + + # Compare results + max_diff_o = torch.max(torch.abs(O_triton - O_torch)) + max_diff_o_tl = torch.max(torch.abs(O_tilelang - O_torch)) + print(f"Max difference in O: {max_diff_o.item()}") + print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") + + max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) + max_diff_s_tl = torch.max( + torch.abs( + S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)] - attn_score_pooled[:, :, : math.ceil(max_seqlen_k / block_size)] + ) + ) + print(f"Max difference in S: {max_diff_s.item()}") + print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") + + assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" + assert torch.allclose( + S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)], + attn_score_pooled[:, :, : math.ceil(max_seqlen_k / block_size)], + atol=1e-2, + rtol=1e-2, + ), f"Score mismatch: {max_diff_s_tl.item()}" + + print("✅ All tests passed!") + + +def do_bench(fn, *args, warmup=10, rep=10, **kwargs): + """ + Do benchmark for a function. 
+ """ + start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] + for _ in range(warmup): + fn(*args, **kwargs) + + torch.cuda.synchronize() + for i in range(rep): + start_event[i].record() + fn(*args, **kwargs) + end_event[i].record() + torch.cuda.synchronize() + + # Record clocks + times = torch.tensor( + [s.elapsed_time(e) for s, e in zip(start_event, end_event)], + dtype=torch.float, + ) + + return times.mean().item() + + +def speed_benchmark_decode_comparison(args): + """Speed benchmark for decode kernel""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + print("\n=== Decode Speed Benchmark Comparison ===") + print("Configuration:") + print(f" Batch size: {batch_size}") + print(f" Q heads: {q_heads}, KV heads: {kv_heads}") + print(f" Max K sequence length: {max_k_seqlen}") + print(f" Head size: {head_size}") + print(f" Block size: {block_size}") + print(f" Data type: {dtype}") + print(f" Variable lengths: {args.test_varlen}") + print(f" s_aux attention: {args.test_sink}") + print() + + # Generate input data + if args.test_varlen: + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + else: + k_seqlens = torch.full((batch_size,), max_k_seqlen, dtype=int) + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + # Generate tensors + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(" Using sink attention with sink values") + + print("Setup complete:") + print(f" Total K tokens: {total_k_tokens}") + print(f" Actual max K seq len: {max_seqlen_k}") + if args.test_varlen: + print(f" K sequence lengths: {k_seqlens.tolist()}") + + # Warmup + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) + + # Benchmark + print("⚡ Benchmarking Tilelang kernel (100 iterations)...") + tilelang_time = do_bench( + flash_attn_with_attn_pool_decode_tilelang, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + False, + tl_kernel, + ) + print(f"Average decode kernel time Tilelang: {tilelang_time:.3f} ms") + + # Benchmark + print("⚡ Benchmarking Triton kernel (100 iterations)...") + triton_time = do_bench( + flash_attn_with_attn_pool_decode, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + ) + print(f"Average decode kernel time Triton: {triton_time:.3f} ms") + + print(f"Speedup: 
{(triton_time / tilelang_time):.3f}") + + +def main(): + args = argparse.Namespace( + batch_size=1, + q_heads=32, + kv_heads=8, + k_seqlen=8192, + head_size=128, + block_size=128, + dtype=T.float16, + ) + args.test_sink = True + args.test_varlen = True + args.dtype = T.float16 + args.num_split = 1 + test_varlen_decode_main(args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Flash Attention Decode with Attention Pooling") + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--q_heads", type=int, default=32, help="Number of query heads") + parser.add_argument("--kv_heads", type=int, default=8, help="Number of key-value heads") + parser.add_argument("--k_seqlen", type=int, default=8192, help="Key sequence length") + parser.add_argument("--head_size", type=int, default=128, choices=[64, 128, 256], help="Head dimension") + parser.add_argument("--block_size", type=int, default=128, help="Block size for computation") + parser.add_argument("--dtype", type=str, default=T.bfloat16, choices=[T.float16, T.bfloat16], help="Data type") + parser.add_argument("--test_varlen", action="store_true", help="Test with truly variable sequence lengths") + parser.add_argument("--test_sink", action="store_true", help="Test with sink attention mechanism") + parser.add_argument("--benchmark", action="store_true", help="Run speed benchmark") + parser.add_argument("--num_split", type=int, default=1, choices=[1, 16], help="Number of splits") + args = parser.parse_args() + args.test_sink = True + args.test_varlen = True + args.dtype = T.float16 + args.num_split = 1 + + if args.benchmark: + speed_benchmark_decode_comparison(args) + else: + test_varlen_decode_main(args) diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py b/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py new file mode 100644 index 000000000..87748512d --- /dev/null +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits_paged.py @@ -0,0 +1,550 @@ +import torch +import math +import argparse +import tilelang +import tilelang.language as T +from example_gqa_decode_varlen_logits import flash_attn_with_attn_pool_decode, repeat_kv, do_bench + +torch.manual_seed(0) + + +def get_configs(): + import itertools + + block_N = [64, 128] + block_H = [64] + num_split = [1] + num_stages = [1, 2, 3] + threads = [128] + _configs = list(itertools.product(block_N, block_H, num_split, num_stages, threads)) + + configs = [{"block_N": c[0], "block_H": c[1], "num_split": c[2], "num_stages": c[3], "threads": c[4]} for c in _configs] + return configs + + +# @autotune(configs=get_configs(), warmup=10, rep=10) +@tilelang.jit(out_idx=[-2, -1]) +def flashattn( + batch, + heads, + k_heads, + max_seqlen_kv, + total_seqlen_k, + dim, + has_sink, + page_block_size, + block_N=128, + block_H=64, + num_split=1, + num_stages=1, + threads=128, +): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + shape_q = [batch, heads, dim] + shape_k = [total_seqlen_k, k_heads, dim] + shape_v = [total_seqlen_k, k_heads, dim] + shape_o = [batch, heads, dim] + shape_s = [batch, heads, math.ceil(max_seqlen_kv / block_N)] + dtype = T.float16 + accum_dtype = T.float32 + kv_group_num = heads // k_heads + assert page_block_size >= block_N and page_block_size % block_N == 0, ( + "page_block_size must be larger than block_N and a multiple of block_N" + ) + + valid_block_H = min(block_H, kv_group_num) + # TODO: check if max_seqlen_kv is correct for varlen case + + @T.prim_func + def 
flashattn_gqa_decode_no_split( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + cu_seqlens_k: T.Tensor([batch + 1], T.int32), + s_aux: T.Tensor([heads], T.float32), + BLOCK_TABLE: T.Tensor([batch, math.ceil(max_seqlen_kv / page_block_size)], T.int32), + Output: T.Tensor(shape_o, dtype), + S: T.Tensor(shape_s, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([valid_block_H, dim], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim], accum_dtype) + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) + s_aux_shared = T.alloc_shared([block_H], T.float32) + + bid = bx + hid = by + cur_kv_head = hid // (kv_group_num // valid_block_H) + + cur_start_k = cu_seqlens_k[bid] + cur_end_k = cu_seqlens_k[bid + 1] + cur_seqlen_k = cur_end_k - cur_start_k + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) + loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) + for k in T.Pipelined(loop_range, num_stages=num_stages): + k_start = BLOCK_TABLE[bid, (k * block_N) // page_block_size] * page_block_size + (k * block_N) % page_block_size + T.copy(K[cur_start_k + k_start : cur_start_k + k_start + block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], -T.infinity(accum_dtype)) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # scores_max_prev is m_i + # scores_max is row_max->m_ij in triton + T.copy(scores_max, S_shared[:, k]) + # scores_scale is alpha in triton + for i in T.Parallel(block_H): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + # scores_sum is l_ij in triton + # logsum is l_i in triton + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim): + acc_o[i, j] *= scores_scale[i] + v_start = BLOCK_TABLE[bid, (k * block_N) // page_block_size] * page_block_size + (k * block_N) % page_block_size + T.copy(V[cur_start_k + v_start : cur_start_k + v_start + block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + if has_sink: + T.copy(s_aux[hid * valid_block_H : hid * valid_block_H + block_H], s_aux_shared) + for i in T.Parallel(block_H): + logsum[i] += s_aux_shared[i] + for i, j in T.Parallel(block_H, 
dim): + acc_o[i, j] /= logsum[i] + for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): + S_shared[h, k] = T.exp2((S_shared[h, k] - scores_max[h]) * scale) / logsum[h] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(acc_o[:valid_block_H, :], O_shared) + T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + T.copy(S_shared[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + + # TODO: split version + return flashattn_gqa_decode_no_split + + +def flash_attn_with_attn_pool_decode_tilelang( + Q: torch.Tensor, ## [tq = b, q_h, q_dim] + K: torch.Tensor, ## [tk, k_h, k_dim] + V: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_k: int, + real_max_k_seqlen: int, + num_split: int, + softmax_scale: float, + s_aux: torch.Tensor = None, + block_size: int = 64, + use_per_kv_head_sparse_index: bool = False, + tl_kernel=None, + block_table: torch.Tensor = None, +): + num_tokens, q_h, head_size = Q.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = K.size(1) + + assert Q.dim() == K.dim() == 3 + assert Q.size(2) == K.size(2) + assert cu_seqlens_k.dim() == 1 + assert head_size in {64, 128, 256} + assert Q.is_contiguous() + assert K.is_contiguous() + assert V.is_contiguous() + + gqa_group_size = q_h // k_h + + O_tl = torch.zeros_like(Q) + S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), dtype=Q.dtype, device=Q.device) + O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux, block_table) + + if use_per_kv_head_sparse_index: + S_tl = torch.max_pool2d(S_tl, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) + else: + S_tl = torch.max_pool2d(S_tl, kernel_size=(q_h, 1), stride=(q_h, 1)) + + return O_tl, S_tl + + +def test_varlen_decode_main(args): + """Test decode kernel with variable sequence lengths""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen # Use as max sequence length + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(f"Using sink attention with sink values: {sink}") + + # Generate variable length k sequences + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + print(f"k_seqlens: {k_seqlens}") + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + print(f"cu_seqlens_k: {cu_seqlens_k}") + + # Generate tensors - Q is [batch_size, q_heads, head_size] for decode + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + print(f"Actual max_seqlen_k: {max_seqlen_k}") + print(f"q_decode shape: 
{q_decode.shape}") + print(f"k_varlen shape: {k_varlen.shape}") + print(f"v_varlen shape: {v_varlen.shape}") + + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink, page_block_size) + + block_table = torch.zeros(batch, math.ceil(real_max_k_seqlen / page_block_size), device="cuda", dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Test our decode kernel + O_triton, S_triton = flash_attn_with_attn_pool_decode( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + ) + O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + real_max_k_seqlen, + args.num_split, + softmax_scale, + s_aux=sink, + block_size=block_size, + tl_kernel=tl_kernel, + block_table=block_table, + ) + for i in range(batch_size): + S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 + + # Create torch reference - pad tensors for comparison + k_padded_list = [] + v_padded_list = [] + + for i in range(batch_size): + actual_k_len = k_seqlens[i] + + # Extract and pad k, v for this batch + k_start = cu_seqlens_k[i] + k_end = cu_seqlens_k[i + 1] + + # Pad to max_seqlen_k + k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) + v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) + + k_padded[:actual_k_len] = k_varlen[k_start:k_end] + v_padded[:actual_k_len] = v_varlen[k_start:k_end] + + k_padded_list.append(k_padded) + v_padded_list.append(v_padded) + + # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] + k_padded_batched = torch.stack(k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + v_padded_batched = torch.stack(v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] + + # Expand q to match kv heads: [b, q_heads, 1, head_size] + q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] + + print(f"q_expanded shape: {q_expanded.shape}") + print(f"k_padded_batched shape: {k_padded_batched.shape}") + print(f"v_padded_batched shape: {v_padded_batched.shape}") + + # Compute torch reference + k_repeat = repeat_kv(k_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + v_repeat = repeat_kv(v_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] + + if sink is None: + # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] + attn_score = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_score[i, :, :, actual_k_len:] = float("-inf") + + attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = 
torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] + else: + # s_aux attention + logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + + # Apply sequence length masking + for i in range(batch_size): + actual_k_len = k_seqlens[i] + logits[i, :, :, actual_k_len:] = float("-inf") + + sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) + sinks = torch.exp(sink_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks + attn_weights = unnormalized_scores / normalizer + + # Mask out invalid positions + for i in range(batch_size): + actual_k_len = k_seqlens[i] + attn_weights[i, :, :, actual_k_len:] = 0.0 + + # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] + O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat) # [b, q_heads, 1, head_size] + + O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] + + # Compute attention score pooling for S + attn_score_pooled = torch.max_pool2d( + attn_weights.squeeze(2), # [b, q_heads, max_seqlen] + kernel_size=(q_heads, block_size), + stride=(q_heads, block_size), + ceil_mode=True, + ).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] + + print(f"O_triton shape: {O_triton.shape}") + print(f"O_tilelang shape: {O_tilelang.shape}") + print(f"O_torch shape: {O_torch.shape}") + print(f"S_triton shape: {S_triton.shape}") + print(f"S_tilelang shape: {S_tilelang.shape}") + print(f"attn_score_pooled shape: {attn_score_pooled.shape}") + + # Compare results + max_diff_o = torch.max(torch.abs(O_triton - O_torch)) + max_diff_o_tl = torch.max(torch.abs(O_tilelang - O_torch)) + print(f"Max difference in O: {max_diff_o.item()}") + print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") + + max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) + max_diff_s_tl = torch.max(torch.abs(S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)] - attn_score_pooled)) + print(f"Max difference in S: {max_diff_s.item()}") + print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") + + assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" + assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" + assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" + assert torch.allclose(S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)], attn_score_pooled, atol=1e-2, rtol=1e-2), ( + f"Score mismatch: {max_diff_s_tl.item()}" + ) + + print("✅ All tests passed!") + + +def speed_benchmark_decode_comparison(args): + """Speed benchmark for decode kernel""" + batch_size = args.batch_size + q_heads = args.q_heads + kv_heads = args.kv_heads + max_k_seqlen = args.k_seqlen + real_max_k_seqlen = args.k_seqlen + head_size = args.head_size + block_size = args.block_size + page_block_size = args.page_block_size + dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 + + print("\n=== Decode Speed Benchmark Comparison ===") + print("Configuration:") + print(f" Batch size: {batch_size}") + print(f" Q heads: {q_heads}, KV heads: {kv_heads}") + print(f" Max K sequence length: {max_k_seqlen}") + print(f" Head size: {head_size}") + print(f" Block size: 
{block_size}") + print(f" Data type: {dtype}") + print(f" Variable lengths: {args.test_varlen}") + print(f" s_aux attention: {args.test_sink}") + print() + + # Generate input data + if args.test_varlen: + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + else: + k_seqlens = torch.full((batch_size,), max_k_seqlen, dtype=int) + + # Generate cumulative sequence lengths for k + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + total_k_tokens = 0 + for i in range(batch_size): + cu_seqlens_k[i] = total_k_tokens + total_k_tokens += k_seqlens[i] + cu_seqlens_k[batch_size] = total_k_tokens + + # Generate tensors + q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + + softmax_scale = 1.0 / math.sqrt(head_size) + max_seqlen_k = int(k_seqlens.max()) + + # Generate sink values if needed + sink = None + if args.test_sink: + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values + print(" Using sink attention with sink values") + + print("Setup complete:") + print(f" Total K tokens: {total_k_tokens}") + print(f" Actual max K seq len: {max_seqlen_k}") + if args.test_varlen: + print(f" K sequence lengths: {k_seqlens.tolist()}") + + # Warmup + num_tokens, q_h, head_size = q_decode.shape + batch = cu_seqlens_k.size(0) - 1 + k_h = k_varlen.size(1) + tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink, page_block_size) + + block_table = torch.zeros(batch, math.ceil(real_max_k_seqlen / page_block_size), device="cuda", dtype=torch.int32) + block_cnt = 0 + for i in range(batch): + cur_seqlen = cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item() + for j in range(math.ceil(cur_seqlen / page_block_size)): + block_table[i, j] = block_cnt + block_cnt += 1 + block_cnt = 0 + + # Benchmark + print("⚡ Benchmarking Tilelang kernel (100 iterations)...") + tilelang_time = do_bench( + flash_attn_with_attn_pool_decode_tilelang, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + False, + tl_kernel, + block_table, + ) + print(f"Average decode kernel time Tilelang: {tilelang_time:.3f} ms") + + # Benchmark + print("⚡ Benchmarking Triton kernel (100 iterations)...") + triton_time = do_bench( + flash_attn_with_attn_pool_decode, + q_decode, + k_varlen, + v_varlen, + cu_seqlens_k, + max_seqlen_k, + args.k_seqlen, + 1, + softmax_scale, + sink, + block_size, + ) + print(f"Average decode kernel time Triton: {triton_time:.3f} ms") + print(f"Speedup: {(triton_time / tilelang_time):.3f}") + + +def main(): + args = argparse.Namespace( + batch_size=1, + q_heads=32, + kv_heads=8, + k_seqlen=8192, + head_size=128, + block_size=128, + dtype=T.float16, + ) + args.test_sink = True + args.test_varlen = True + args.dtype = T.float16 + args.num_split = 1 + args.page_block_size = 128 + test_varlen_decode_main(args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Flash Attention Decode with Attention Pooling") + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--q_heads", type=int, default=32, help="Number of query heads") + parser.add_argument("--kv_heads", type=int, default=8, help="Number of key-value heads") + 
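# NOTE: --k_seqlen acts as the maximum KV length here: it sets the kernel's max_seqlen_kv and the block-table width. +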
parser.add_argument("--k_seqlen", type=int, default=8192, help="Key sequence length") + parser.add_argument("--head_size", type=int, default=128, choices=[64, 128, 256], help="Head dimension") + parser.add_argument("--block_size", type=int, default=128, help="Block size for computation") + parser.add_argument("--dtype", type=str, default=T.bfloat16, choices=[T.float16, T.bfloat16], help="Data type") + parser.add_argument("--test_varlen", action="store_true", help="Test with truly variable sequence lengths") + parser.add_argument("--test_sink", action="store_true", help="Test with sink attention mechanism") + parser.add_argument("--benchmark", action="store_true", help="Run speed benchmark") + parser.add_argument("--num_split", type=int, default=1, choices=[1, 16], help="Number of splits") + parser.add_argument("--page_block_size", type=int, default=128, help="Page block size") + args = parser.parse_args() + args.test_sink = True + args.test_varlen = True + args.dtype = T.float16 + args.num_split = 1 + + if args.benchmark: + speed_benchmark_decode_comparison(args) + else: + test_varlen_decode_main(args) diff --git a/examples/flash_decoding/example_mha_inference.py b/examples/flash_decoding/example_mha_inference.py index b4285a64f..24a90c57b 100644 --- a/examples/flash_decoding/example_mha_inference.py +++ b/examples/flash_decoding/example_mha_inference.py @@ -10,102 +10,24 @@ @tilelang.jit(out_idx=[5]) def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_N): - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, seqlen_q, heads, dim] shape_kv = [batch, seqlen_kv, heads, dim] part_shape = [batch, seqlen_q, heads, num_split, dim] - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 - @T.macro - def MMA0( + @T.prim_func + def flashattn_mha_inference( + Q: T.Tensor(shape_q, dtype), K: T.Tensor(shape_kv, dtype), - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - k: T.int32, - mid: T.int32, - hid: T.int32, - bid: T.int32, - sid: T.int32, - ): - T.copy( - K[bid, (seqlen_kv // num_split) * sid + k * block_N:(seqlen_kv // num_split) * sid + - (k + 1) * block_N, hid, :], K_shared) - # TODO: Handle causal split case - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(mid * block_M + i >= k * block_N + j, 0, - -T.infinity(acc_s.dtype)) - else: - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def MMA1( V: T.Tensor(shape_kv, dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - k: T.int32, - hid: T.int32, - bid: T.int32, - sid: T.int32, - ): - T.copy( - V[bid, (seqlen_kv // num_split) * sid + k * block_N:(seqlen_kv // num_split) * sid + - (k + 1) * block_N, hid, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: 
T.FragmentBuffer([block_M], accum_dtype), + glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), + Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] + Output: T.Tensor(shape_q, dtype), ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - - @T.macro - def flash_attn_split( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_kv, dtype), - V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), - ): - with T.Kernel( - T.ceildiv(seqlen_q, block_M), heads * batch, num_split, - threads=128) as (bx, by, bz): + # split + with T.Kernel(T.ceildiv(seqlen_q, block_M), heads * batch, num_split, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -126,43 +48,73 @@ def flash_attn_split( # NOTE(wt): tma barrier has some problems with padded dimensions (seq_q here) currently # disable relevant tma copy and use SIMT as fallback for now - T.copy(Q[bid, mid * block_M:(mid + 1) * block_M, hid, :], Q_shared, disable_tma=True) + T.copy(Q[bid, mid * block_M : (mid + 1) * block_M, hid, :], Q_shared, disable_tma=True) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) # TODO: Handle causal split case loop_range = ( - T.min(T.ceildiv(seqlen_kv, block_N), T.ceildiv( - (mid + 1) * block_M, block_N)) if is_causal else T.ceildiv( - (seqlen_kv // num_split), block_N)) + T.min(T.ceildiv(seqlen_kv, block_N), T.ceildiv((mid + 1) * block_M, block_N)) + if is_causal + else T.ceildiv((seqlen_kv // num_split), block_N) + ) for k in T.Pipelined(loop_range, num_stages=2): - MMA0(K, Q_shared, K_shared, acc_s, k, mid, hid, bid, sid) - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, - logsum) - Rescale(acc_o, scores_scale) - MMA1(V, V_shared, acc_s_cast, acc_o, k, hid, bid, sid) + T.copy( + K[bid, (seqlen_kv // num_split) * sid + k * block_N : (seqlen_kv // num_split) * sid + (k + 1) * block_N, hid, :], + K_shared, + ) + # TODO: Handle causal split case + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(mid * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + 
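# non-causal path: clear the score tile to zero before the QK^T GEMM accumulates into it +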
T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # in the first ceil_div(kBlockM, kBlockN) steps. + # for i in T.Parallel(block_M): + # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy( + V[bid, (seqlen_kv // num_split) * sid + k * block_N : (seqlen_kv // num_split) * sid + (k + 1) * block_N, hid, :], + V_shared, + ) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] for i in T.Parallel(block_M): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, glse[bid, hid, sid, mid * block_M:(mid + 1) * block_M]) + T.copy(logsum, glse[bid, hid, sid, mid * block_M : (mid + 1) * block_M]) T.copy(acc_o, O_shared) - T.copy( - O_shared, - Output_partial[bid, mid * block_M:(mid + 1) * block_M, hid, sid, :], - disable_tma=True) - - @T.macro - def combine( - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), - Output: T.Tensor(shape_q, dtype), - ): + T.copy(O_shared, Output_partial[bid, mid * block_M : (mid + 1) * block_M, hid, sid, :], disable_tma=True) + + # combine with T.Kernel(T.ceildiv(seqlen_q, block_M), heads, batch, threads=128) as (bx, by, bz): po_local = T.alloc_fragment([block_M, dim], dtype) - po_shared = T.alloc_shared([block_M, dim], dtype) o_accum_local = T.alloc_fragment([block_M, dim], accum_dtype) o_shared = T.alloc_shared([block_M, dim], dtype) lse_local = T.alloc_fragment([num_split, block_M], dtype) @@ -171,20 +123,17 @@ def combine( lse_max_local = T.alloc_fragment([block_M], accum_dtype) scale_local = T.alloc_fragment([block_M], accum_dtype) - T.annotate_layout({ - o_accum_local: T.Fragment(o_accum_local.shape, forward_thread_fn=lambda i, j: i), - o_shared: tilelang.layout.make_swizzled_layout(o_shared), - po_shared: tilelang.layout.make_swizzled_layout(po_shared), - }) - T.clear(lse_logsum_local) T.clear(o_accum_local) - T.copy(glse[ - bz, - by, - :, - bx * block_M:(bx + 1) * block_M, - ], lse_local) + T.copy( + glse[ + bz, + by, + :, + bx * block_M : (bx + 1) * block_M, + ], + lse_local, + ) T.reduce_max(lse_local, lse_max_local, dim=0, clear=False) for k in T.Pipelined(num_split): T.copy(lse_local[k, :], lse_local_split) @@ -193,11 +142,7 @@ def combine( for i in T.Parallel(block_M): lse_logsum_local[i] = T.log2(lse_logsum_local[i]) + lse_max_local[i] for k in T.Pipelined(num_split, num_stages=2): - 
T.copy( - Output_partial[bz, bx * block_M:(bx + 1) * block_M, by, k, :], - po_shared, - disable_tma=True) - T.copy(po_shared, po_local) + T.copy(Output_partial[bz, bx * block_M : (bx + 1) * block_M, by, k, :], po_local) for i in T.Parallel(block_M): lse_local_split[i] = lse_local[k, i] for i in T.Parallel(block_M): @@ -205,19 +150,7 @@ def combine( for i, j in T.Parallel(block_M, dim): o_accum_local[i, j] += po_local[i, j] * scale_local[i] T.copy(o_accum_local, o_shared) - T.copy(o_shared, Output[bz, bx * block_M:(bx + 1) * block_M, by, :], disable_tma=True) - - @T.prim_func - def flashattn_mha_inference( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_kv, dtype), - V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] - Output: T.Tensor(shape_q, dtype), - ): - flash_attn_split(Q, K, V, glse, Output_partial) - combine(glse, Output_partial, Output) + T.copy(o_shared, Output[bz, bx * block_M : (bx + 1) * block_M, by, :], disable_tma=True) return flashattn_mha_inference @@ -225,10 +158,10 @@ def flashattn_mha_inference( def ref_program(Q, K, V, glse, Output_partial, causal): assert causal is False dim = Q.size(-1) - scores = torch.einsum('bqhd,bkhd->bhqk', Q, K) + scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum('bhqk,bkhd->bqhd', attention_weights, V) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) return output @@ -256,7 +189,7 @@ def flash_split_ref(Q, K, V, causal): block_N = 128 seqlen_kv = K.size(1) - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) acc_s = torch.empty((batch, nheads, block_M, block_N), device="cuda", dtype=torch.float) acc_s_cast = torch.empty((batch, nheads, block_M, block_N), device="cuda", dtype=torch.float16) acc_o = torch.empty((batch, block_M, nheads, dim), device="cuda", dtype=torch.float) @@ -273,14 +206,15 @@ def flash_split_ref(Q, K, V, causal): for ks in range(num_split): acc_o.fill_(0) logsum.fill_(0) - scores_max.fill_(float('-inf')) - scores_max_prev.fill_(float('-inf')) + scores_max.fill_(float("-inf")) + scores_max_prev.fill_(float("-inf")) for i in range(int((seqlen_kv // num_split) / block_N)): acc_s.fill_(0) - acc_s = torch.einsum('bqhd,bkhd->bhqk', Q_, - K[:, (seqlen_kv // num_split) * ks + - i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) # [batch, seqlen, nheads, block_N] + acc_s = torch.einsum( + "bqhd,bkhd->bhqk", + Q_, + K[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) # [batch, seqlen, nheads, block_N] scores_max_prev = scores_max scores_max = acc_s.max(dim=-1, keepdim=False).values # [blockM] scores_scale = torch.exp2(scores_max_prev - scores_max) @@ -288,9 +222,10 @@ def flash_split_ref(Q, K, V, causal): acc_s = torch.exp2(acc_s - scores_max[:, :, :, None]) acc_s_cast = acc_s.to(torch.float16) acc_o += torch.einsum( - 'bhqk,bkhd->bqhd', acc_s_cast, - V[:, (seqlen_kv // num_split) * ks + i * block_N:(seqlen_kv // num_split) * ks + - (i + 1) * block_N, :, :]) + "bhqk,bkhd->bqhd", + acc_s_cast, + V[:, (seqlen_kv // num_split) * ks + i * block_N : (seqlen_kv // num_split) * ks + (i + 1) * block_N, :, :], + ) scores_sum = acc_s.sum(dim=-1, keepdim=False) logsum = logsum * scores_scale + scores_sum acc_o /= logsum[:, :, :, 
None].transpose(1, 2) @@ -298,13 +233,10 @@ def flash_split_ref(Q, K, V, causal): gacc_o[ks, :, :, :, :] = acc_o glogsum[ks, :, :, :] = logsum - return glogsum.to(torch.float16).permute(1, 2, 0, - 3), gacc_o.to(torch.float16).permute(1, 2, 3, 0, 4) + return glogsum.to(torch.float16).permute(1, 2, 0, 3), gacc_o.to(torch.float16).permute(1, 2, 3, 0, 4) -def main(): - BATCH, H, Q_CTX, KV_CTX, D_HEAD = 1, 32, 128, 8192, 128 - causal = False +def main(BATCH=1, H=32, Q_CTX=128, KV_CTX=8192, D_HEAD=128, causal=False): flops_per_matmul = 2.0 * BATCH * H * Q_CTX * KV_CTX * D_HEAD total_flops = 2 * flops_per_matmul if causal: @@ -325,5 +257,13 @@ def main(): print("{:.2f} TFlops".format(total_flops / latency * 1e-9)) +def run_regression_perf(BATCH=1, H=32, Q_CTX=128, KV_CTX=8192, D_HEAD=128, causal=False): + BLOCK_M = 128 + BLOCK_N = 64 + kernel = flashattn(BATCH, H, Q_CTX, KV_CTX, D_HEAD, causal, BLOCK_M, BLOCK_N) + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/flash_decoding/regression_example_flash_decoding.py b/examples/flash_decoding/regression_example_flash_decoding.py new file mode 100644 index 000000000..476bceb34 --- /dev/null +++ b/examples/flash_decoding/regression_example_flash_decoding.py @@ -0,0 +1,17 @@ +import tilelang.testing +import example_gqa_decode +import example_mha_inference + + +def regression_example_gqa_decode(): + tilelang.testing.process_func(example_gqa_decode.run_regression_perf) + + +def regression_example_mha_inference(): + tilelang.testing.process_func( + example_mha_inference.run_regression_perf, BATCH=1, H=32, Q_CTX=128, KV_CTX=2048, D_HEAD=128, causal=False + ) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/flash_decoding/test_example_flash_decoding.py b/examples/flash_decoding/test_example_flash_decoding.py index a6ec1c68e..a02a92097 100644 --- a/examples/flash_decoding/test_example_flash_decoding.py +++ b/examples/flash_decoding/test_example_flash_decoding.py @@ -2,6 +2,8 @@ import example_gqa_decode import example_mha_inference +import example_gqa_decode_varlen_logits +import example_gqa_decode_varlen_logits_paged # TODO(lei): fix the correctness of gqa decode on sm90 @@ -12,7 +14,15 @@ def test_example_example_gqa_decode(): def test_example_example_mha_inference(): - example_mha_inference.main() + example_mha_inference.main(BATCH=1, H=32, Q_CTX=128, KV_CTX=2048, D_HEAD=128, causal=False) + + +def test_example_example_gqa_decode_varlen_logits(): + example_gqa_decode_varlen_logits.main() + + +def test_example_example_gqa_decode_varlen_logits_paged(): + example_gqa_decode_varlen_logits_paged.main() if __name__ == "__main__": diff --git a/examples/fusedmoe/example_fusedmoe_tilelang.py b/examples/fusedmoe/example_fusedmoe_tilelang.py index a8d684965..5c236dd80 100644 --- a/examples/fusedmoe/example_fusedmoe_tilelang.py +++ b/examples/fusedmoe/example_fusedmoe_tilelang.py @@ -9,17 +9,18 @@ @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) -def moe_forward_tilelang_shared(d_hidden, - d_expert, - n_shared_experts, - dtype, - num_tokens, - block_token=128, - block_dhidden=128, - block_dexpert=128, - threads=256, - num_stages=1): - +def moe_forward_tilelang_shared( + d_hidden, + d_expert, + n_shared_experts, + dtype, + num_tokens, + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, +): scale = 1.44269504 # 
log2(e) # Parameters @@ -32,21 +33,19 @@ def moe_forward_tilelang_shared(d_hidden, shared_W_up_shape = (dexpert, dhidden) shared_W_down_shape = (dhidden, dexpert) - accum_type = "float32" + accum_type = T.float32 @T.prim_func def kernel_shared( - input: T.Tensor(input_shape, dtype), # type: ignore - shared_W_gate: T.Tensor(shared_W_gate_shape, dtype), # type: ignore - shared_W_up: T.Tensor(shared_W_up_shape, dtype), # type: ignore - shared_W_down: T.Tensor(shared_W_down_shape, dtype), # type: ignore - up_logits: T.Tensor((num_tokens, dexpert), dtype), # type: ignore - output: T.Tensor(input_shape, dtype), # type: ignore + input: T.Tensor(input_shape, dtype), # type: ignore + shared_W_gate: T.Tensor(shared_W_gate_shape, dtype), # type: ignore + shared_W_up: T.Tensor(shared_W_up_shape, dtype), # type: ignore + shared_W_down: T.Tensor(shared_W_down_shape, dtype), # type: ignore + up_logits: T.Tensor((num_tokens, dexpert), dtype), # type: ignore + output: T.Tensor(input_shape, dtype), # type: ignore ): # Step 1: Compute gate and up logits - with T.Kernel( - T.ceildiv(num_tokens, block_token), T.ceildiv(dexpert, block_dexpert), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(num_tokens, block_token), T.ceildiv(dexpert, block_dexpert), threads=threads) as (bx, by): # Split the block to shared experts and routed experts input_shared = T.alloc_fragment((block_token, block_dhidden), dtype=dtype) W_gate_shared = T.alloc_shared((block_dexpert, block_dhidden), dtype=dtype) @@ -70,16 +69,13 @@ def kernel_shared( # Fuse with SiLU and element-wise product for i, j in T.Parallel(block_token, block_dexpert): - gate_logits_local[i, j] = gate_logits_local[i, j] * ( - 1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) + gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) up_logits_local[i, j] = up_logits_local[i, j] * gate_logits_local[i, j] T.copy(up_logits_local, up_logits[bx * block_token, by * block_dexpert]) # Step 2: Compute down logits - with T.Kernel( - T.ceildiv(num_tokens, block_token), T.ceildiv(dhidden, block_dhidden), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(num_tokens, block_token), T.ceildiv(dhidden, block_dhidden), threads=threads) as (bx, by): up_logits_shared = T.alloc_fragment((block_token, block_dexpert), dtype=dtype) W_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_type) @@ -98,20 +94,21 @@ def kernel_shared( @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) -def moe_forward_tilelang_routed(d_hidden, - d_expert, - n_routed_experts, - dtype, - group_sum, - group_count, - block_token=128, - block_dhidden=128, - block_dexpert=128, - threads=256, - num_stages=1, - k_pack=1, - coalesced_width=None): - +def moe_forward_tilelang_routed( + d_hidden, + d_expert, + n_routed_experts, + dtype, + group_sum, + group_count, + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, + k_pack=1, + coalesced_width=None, +): scale = 1.44269504 # log2(e) # Parameters @@ -124,7 +121,7 @@ def moe_forward_tilelang_routed(d_hidden, # group_count = len(group_sizes_list) # M = sum([(group_size + block_token - 1) // block_token for group_size in group_sizes_list]) M = math.ceil(group_sum / block_token) + group_count - accum_dtype = "float32" + accum_dtype = T.float32 # Tensors: Note that input shape is reshape to (bs * seq_len * n_experts_per_token, 
dhidden) for grouped gemm input_shape = (group_sum, dhidden) @@ -132,22 +129,22 @@ def moe_forward_tilelang_routed(d_hidden, routed_expert_gate_shape = (n_routed_experts, dexpert, dhidden) routed_expert_up_shape = (n_routed_experts, dexpert, dhidden) routed_expert_down_shape = (n_routed_experts, dhidden, dexpert) - routed_expert_weights_shape = (group_sum) - group_sizes_shape = (n_routed_experts) + routed_expert_weights_shape = group_sum + group_sizes_shape = n_routed_experts @T.prim_func def kernel( - input: T.Tensor(input_shape, dtype), # type: ignore - routed_expert_gate: T.Tensor(routed_expert_gate_shape, dtype), # type: ignore - routed_expert_up: T.Tensor(routed_expert_up_shape, dtype), # type: ignore - routed_expert_down: T.Tensor(routed_expert_down_shape, dtype), # type: ignore - routed_expert_weights: T.Tensor(routed_expert_weights_shape, dtype), # type: ignore - group_sizes: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_offsets: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_padded_offsets: T.Tensor(group_sizes_shape, "int32"), # type: ignore - group_idx_for_bx: T.Tensor((M,), "int32"), # type: ignore - up_logits: T.Tensor(intermediate_shape, dtype), # type: ignore - output: T.Tensor(input_shape, dtype), # type: ignore + input: T.Tensor(input_shape, dtype), # type: ignore + routed_expert_gate: T.Tensor(routed_expert_gate_shape, dtype), # type: ignore + routed_expert_up: T.Tensor(routed_expert_up_shape, dtype), # type: ignore + routed_expert_down: T.Tensor(routed_expert_down_shape, dtype), # type: ignore + routed_expert_weights: T.Tensor(routed_expert_weights_shape, dtype), # type: ignore + group_sizes: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_offsets: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_padded_offsets: T.Tensor(group_sizes_shape, T.int32), # type: ignore + group_idx_for_bx: T.Tensor((M,), T.int32), # type: ignore + up_logits: T.Tensor(intermediate_shape, dtype), # type: ignore + output: T.Tensor(input_shape, dtype), # type: ignore ): # Step 1: Compute gate and up logits with T.Kernel(M, T.ceildiv(dexpert, block_dexpert), threads=threads) as (bx, by): @@ -158,58 +155,44 @@ def kernel( gate_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) up_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) - cur_group_idx = T.alloc_local([1], "int32") - cur_group_size = T.alloc_local([1], "int32") - T.use_swizzle(10, enable=True) m_start_padded = bx * block_token - cur_group_idx[0] = group_idx_for_bx[bx] + cur_group_idx = group_idx_for_bx[bx] - cur_group_size[0] = group_sizes[cur_group_idx[0]] - m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[ - cur_group_idx[0]] - actual_rows = T.max( - 0, - T.min(block_token, cur_group_size[0] - - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) + cur_group_size = group_sizes[cur_group_idx] + m_start = m_start_padded - group_padded_offsets[cur_group_idx] + group_offsets[cur_group_idx] + actual_rows = T.max(0, T.min(block_token, cur_group_size - (m_start_padded - group_padded_offsets[cur_group_idx]))) T.clear(gate_logits_local) T.clear(up_logits_local) for k in T.Pipelined(T.ceildiv(dhidden, block_dhidden), num_stages=num_stages): T.copy( - input[m_start:m_start + block_token, k * block_dhidden:(k + 1) * block_dhidden], + input[m_start : m_start + block_token, k * block_dhidden : (k + 1) * block_dhidden], input_shared, - coalesced_width=coalesced_width) + 
coalesced_width=coalesced_width, + ) T.copy( - routed_expert_gate[cur_group_idx[0], - by * block_dexpert:(by + 1) * block_dexpert, - k * block_dhidden:(k + 1) * block_dhidden], + routed_expert_gate[ + cur_group_idx, by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden + ], routed_expert_gate_shared, - coalesced_width=coalesced_width) - T.gemm( - input_shared, - routed_expert_gate_shared, - gate_logits_local, - k_pack=k_pack, - transpose_B=True) + coalesced_width=coalesced_width, + ) + T.gemm(input_shared, routed_expert_gate_shared, gate_logits_local, k_pack=k_pack, transpose_B=True) T.copy( - routed_expert_up[cur_group_idx[0], by * block_dexpert:(by + 1) * block_dexpert, - k * block_dhidden:(k + 1) * block_dhidden], - routed_expert_up_shared, - coalesced_width=coalesced_width) - T.gemm( - input_shared, + routed_expert_up[ + cur_group_idx, by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden + ], routed_expert_up_shared, - up_logits_local, - k_pack=k_pack, - transpose_B=True) + coalesced_width=coalesced_width, + ) + T.gemm(input_shared, routed_expert_up_shared, up_logits_local, k_pack=k_pack, transpose_B=True) for i, j in T.Parallel(block_token, block_dexpert): - gate_logits_local[i, j] = gate_logits_local[i, j] * ( - 1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) + gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) up_logits_local[i, j] = up_logits_local[i, j] * gate_logits_local[i, j] for i, j in T.Parallel(block_token, block_dexpert): @@ -222,60 +205,42 @@ def kernel( routed_expert_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_dtype) - cur_group_idx = T.alloc_local([1], "int32") - cur_group_size = T.alloc_local([1], "int32") - T.use_swizzle(10, enable=True) m_start_padded = bx * block_token - cur_group_idx[0] = group_idx_for_bx[bx] + cur_group_idx = group_idx_for_bx[bx] - cur_group_size[0] = group_sizes[cur_group_idx[0]] - m_start = m_start_padded - group_padded_offsets[cur_group_idx[0]] + group_offsets[ - cur_group_idx[0]] - actual_rows = T.max( - 0, - T.min(block_token, cur_group_size[0] - - (m_start_padded - group_padded_offsets[cur_group_idx[0]]))) + cur_group_size = group_sizes[cur_group_idx] + m_start = m_start_padded - group_padded_offsets[cur_group_idx] + group_offsets[cur_group_idx] + actual_rows = T.max(0, T.min(block_token, cur_group_size - (m_start_padded - group_padded_offsets[cur_group_idx]))) T.clear(output_local) for k in T.Pipelined(T.ceildiv(dexpert, block_dexpert), num_stages=num_stages): T.copy( - up_logits[m_start:m_start + block_token, - k * block_dexpert:(k + 1) * block_dexpert], + up_logits[m_start : m_start + block_token, k * block_dexpert : (k + 1) * block_dexpert], up_logits_shared, - coalesced_width=coalesced_width) + coalesced_width=coalesced_width, + ) T.copy( - routed_expert_down[cur_group_idx[0], - by * block_dhidden:(by + 1) * block_dhidden, - k * block_dexpert:(k + 1) * block_dexpert], - routed_expert_down_shared, - coalesced_width=coalesced_width) - T.gemm( - up_logits_shared, + routed_expert_down[ + cur_group_idx, by * block_dhidden : (by + 1) * block_dhidden, k * block_dexpert : (k + 1) * block_dexpert + ], routed_expert_down_shared, - output_local, - k_pack=k_pack, - transpose_B=True) + coalesced_width=coalesced_width, + ) + T.gemm(up_logits_shared, routed_expert_down_shared, output_local, k_pack=k_pack, 
transpose_B=True) for i, j in T.Parallel(block_token, block_dhidden): if i < actual_rows: - output[m_start + i, by * block_dhidden + - j] = output_local[i, j] * routed_expert_weights[m_start + i] + output[m_start + i, by * block_dhidden + j] = output_local[i, j] * routed_expert_weights[m_start + i] return kernel class Expert(nn.Module): - - def __init__(self, - config: Dict, - gate: torch.Tensor, - up: torch.Tensor, - down: torch.Tensor, - d_expert: Optional[int] = None): + def __init__(self, config: Dict, gate: torch.Tensor, up: torch.Tensor, down: torch.Tensor, d_expert: Optional[int] = None): super().__init__() self.config = config self.act_fn = nn.SiLU() @@ -294,14 +259,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MoEGate(nn.Module): - def __init__(self, config: Dict, weights: Dict): super().__init__() self.top_k: int = config["n_experts_per_token"] self.num_experts: int = config["n_routed_experts"] self.d_hidden: int = config["d_hidden"] - self.W_g_weight = weights['router.weight'].t() + self.W_g_weight = weights["router.weight"].t() def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: logits = x @ self.W_g_weight @@ -312,76 +276,69 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: class MoE(nn.Module): - - def __init__(self, - config: Dict, - shared_kernel: tilelang.JITKernel, - routed_kernel: tilelang.JITKernel, - weights: Dict, - padding_M: int = 128): + def __init__( + self, config: Dict, shared_kernel: tilelang.JITKernel, routed_kernel: tilelang.JITKernel, weights: Dict, padding_M: int = 128 + ): super().__init__() self.config = config self.shared_kernel = shared_kernel self.routed_kernel = routed_kernel self.padding_M = padding_M - self.experts = nn.ModuleList([ - Expert( - config, - gate=weights[f'experts.{i}.0.weight'], - up=weights[f'experts.{i}.1.weight'], - down=weights[f'experts.{i}.2.weight']) for i in range(config["n_routed_experts"]) - ]) + self.experts = nn.ModuleList( + [ + Expert( + config, + gate=weights[f"experts.{i}.0.weight"], + up=weights[f"experts.{i}.1.weight"], + down=weights[f"experts.{i}.2.weight"], + ) + for i in range(config["n_routed_experts"]) + ] + ) self.device = torch.device("cuda") self.gating_network = MoEGate(config, weights).to(self.device) shared_expert_dim = config["d_expert"] * config["n_shared_experts"] self.shared_expert = Expert( config=config, - gate=weights['shared_experts.0.weight'], - up=weights['shared_experts.1.weight'], - down=weights['shared_experts.2.weight'], - d_expert=shared_expert_dim).to(self.device) + gate=weights["shared_experts.0.weight"], + up=weights["shared_experts.1.weight"], + down=weights["shared_experts.2.weight"], + d_expert=shared_expert_dim, + ).to(self.device) self.expert_cache = torch.zeros( - (config["batch_size"] * config["seq_len"], config["d_hidden"]), - dtype=torch.float16, - device=self.device) - self.stacked_expert_w_gate = torch.stack([expert.W_gate_weight for expert in self.experts], - dim=0) - self.stacked_expert_w_up = torch.stack([expert.W_up_weight for expert in self.experts], - dim=0) - self.stacked_expert_w_down = torch.stack([expert.W_down_weight for expert in self.experts], - dim=0) + (config["batch_size"] * config["seq_len"], config["d_hidden"]), dtype=torch.float16, device=self.device + ) + self.stacked_expert_w_gate = torch.stack([expert.W_gate_weight for expert in self.experts], dim=0) + self.stacked_expert_w_up = torch.stack([expert.W_up_weight for expert in self.experts], dim=0) + self.stacked_expert_w_down = 
torch.stack([expert.W_down_weight for expert in self.experts], dim=0) self.stacked_expert_tokens = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_hidden"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_hidden"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) self.stacked_expert_weights = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), dtype=torch.float16, device=self.device + ) self.stacked_expert_tokens_idxs = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), - dtype=torch.int64, - device=self.device) + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"]), dtype=torch.int64, device=self.device + ) self.up_logits_shared = torch.empty( - (config["batch_size"] * config["seq_len"], self.config["d_expert"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"], self.config["d_expert"]), dtype=torch.float16, device=self.device + ) self.expert_output_shared = torch.empty( - (config["batch_size"] * config["seq_len"], self.config["d_hidden"]), - dtype=torch.float16, - device=self.device) + (config["batch_size"] * config["seq_len"], self.config["d_hidden"]), dtype=torch.float16, device=self.device + ) self.up_logits_routed = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_expert"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_expert"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) self.expert_output_routed = torch.empty( - (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], - self.config["d_hidden"]), + (config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], self.config["d_hidden"]), dtype=torch.float16, - device=self.device) + device=self.device, + ) @torch.no_grad() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -413,22 +370,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.stacked_expert_tokens[start_idx:end_idx] = expert_tokens self.stacked_expert_tokens_idxs[start_idx:end_idx] = exp_token_idxs - self.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[ - idxs[start_idx:end_idx]] + self.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[idxs[start_idx:end_idx]] group_sizes = torch.tensor(counts, dtype=torch.int32, device=self.device) - group_offset = torch.tensor( - tokens_per_expert - counts, dtype=torch.int32, device=self.device) + group_offset = torch.tensor(tokens_per_expert - counts, dtype=torch.int32, device=self.device) group_padded_offsets = [0 for _ in range(len(group_sizes))] for i in range(1, len(group_sizes)): - group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil( - (counts[i - 1] + 1) / self.padding_M) * self.padding_M + group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil((counts[i - 1] + 1) / self.padding_M) * self.padding_M block_token = 128 - M = math.ceil( - self.config["batch_size"] * self.config["seq_len"] * - self.config["n_experts_per_token"] / block_token) + self.config["n_routed_experts"] + M = ( + math.ceil(self.config["batch_size"] * self.config["seq_len"] * self.config["n_experts_per_token"] / block_token) + + 
self.config["n_routed_experts"] + ) group_idx_for_bx = [0 for _ in range(M)] for bx in range(M): @@ -437,8 +392,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if m_start_padded >= group_padded_offsets[i]: group_idx_for_bx[bx] = i - group_padded_offsets = torch.tensor( - group_padded_offsets, dtype=torch.int32, device=self.device) + group_padded_offsets = torch.tensor(group_padded_offsets, dtype=torch.int32, device=self.device) group_idx_for_bx = torch.tensor(group_idx_for_bx, dtype=torch.int32, device=self.device) # Multi-stream execution @@ -448,11 +402,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: with torch.cuda.stream(routed_stream): # Tilelang version: Grouped GEMM - self.routed_kernel(self.stacked_expert_tokens, self.stacked_expert_w_gate, - self.stacked_expert_w_up, self.stacked_expert_w_down, - self.stacked_expert_weights, group_sizes, group_offset, - group_padded_offsets, group_idx_for_bx, self.up_logits_routed, - self.expert_output_routed) + self.routed_kernel( + self.stacked_expert_tokens, + self.stacked_expert_w_gate, + self.stacked_expert_w_up, + self.stacked_expert_w_down, + self.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + self.up_logits_routed, + self.expert_output_routed, + ) # Scatter reduce self.expert_cache = torch.scatter_reduce( @@ -460,14 +422,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: 0, self.stacked_expert_tokens_idxs.view(-1, 1).repeat(1, x_flat.shape[-1]), self.expert_output_routed, - reduce='sum') + reduce="sum", + ) routed_output = self.expert_cache.view(*orig_shape) with torch.cuda.stream(shared_stream): - - self.shared_kernel(x_flat, self.shared_expert.W_gate_weight, - self.shared_expert.W_up_weight, self.shared_expert.W_down_weight, - self.up_logits_shared, self.expert_output_shared) + self.shared_kernel( + x_flat, + self.shared_expert.W_gate_weight, + self.shared_expert.W_up_weight, + self.shared_expert.W_down_weight, + self.up_logits_shared, + self.expert_output_shared, + ) shared_output = self.expert_output_shared.view(*orig_shape) torch.cuda.synchronize() @@ -491,14 +458,15 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: """ input_tensor, weights, config = data - dtype_str = "float16" + dtype_str = T.float16 shared_kernel = moe_forward_tilelang_shared( config["d_hidden"], config["d_expert"], config["n_shared_experts"], dtype=dtype_str, - num_tokens=config["batch_size"] * config["seq_len"]) + num_tokens=config["batch_size"] * config["seq_len"], + ) routed_kernel = moe_forward_tilelang_routed( config["d_hidden"], config["d_expert"], @@ -512,7 +480,8 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: threads=256, num_stages=1, k_pack=1, - coalesced_width=2) + coalesced_width=2, + ) moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) @@ -521,13 +490,7 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: return output -def main(d_hidden=7168, - d_expert=2048, - n_routed_experts=8, - n_shared_experts=1, - n_experts_per_token=4, - batch_size=1, - seq_len=8192): +def main(d_hidden=7168, d_expert=2048, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=8192): config = { "dhidden": d_hidden, "dexpert": d_expert, @@ -536,7 +499,7 @@ def main(d_hidden=7168, "nexpertspertoken": n_experts_per_token, "bs": batch_size, "seqlen": seq_len, - "seed": 81394 + "seed": 81394, } data = generate_input(**config) @@ -551,5 +514,121 @@ def main(d_hidden=7168, 
print("✅ Tilelang and Torch match") +def run_regression_perf( + d_hidden=7168, d_expert=2048, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=8192 +): + config = { + "dhidden": d_hidden, + "dexpert": d_expert, + "nroutedexperts": n_routed_experts, + "nsharedexperts": n_shared_experts, + "nexpertspertoken": n_experts_per_token, + "bs": batch_size, + "seqlen": seq_len, + "seed": 81394, + } + from tilelang.profiler import do_bench + + data = generate_input(**config) + + x, weights, config = data + + dtype_str = "float16" + + shared_kernel = moe_forward_tilelang_shared( + config["d_hidden"], + config["d_expert"], + config["n_shared_experts"], + dtype=dtype_str, + num_tokens=config["batch_size"] * config["seq_len"], + ) + routed_kernel = moe_forward_tilelang_routed( + config["d_hidden"], + config["d_expert"], + config["n_routed_experts"], + dtype=dtype_str, + group_sum=config["batch_size"] * config["seq_len"] * config["n_experts_per_token"], + group_count=config["n_routed_experts"], + block_token=128, + block_dhidden=128, + block_dexpert=128, + threads=256, + num_stages=1, + k_pack=1, + coalesced_width=2, + ) + + moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) + batch_size, seq_len, hidden_dim = x.shape + expert_indices, expert_scores = moe.gating_network(x) + flat_expert_indices = expert_indices.view(-1) + flat_expert_weights = expert_scores.view(-1) + x_flat = x.view(-1, hidden_dim) + idxs = flat_expert_indices.argsort() + counts = flat_expert_indices.bincount().cpu().numpy() + tokens_per_expert = counts.cumsum() + num_per_tok = moe.config["n_experts_per_token"] + token_idxs = idxs // num_per_tok + for expert_id, end_idx in enumerate(tokens_per_expert): + start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1] + if start_idx == end_idx: + continue + exp_token_idxs = token_idxs[start_idx:end_idx] + expert_tokens = x_flat[exp_token_idxs] + moe.stacked_expert_tokens[start_idx:end_idx] = expert_tokens + moe.stacked_expert_tokens_idxs[start_idx:end_idx] = exp_token_idxs + moe.stacked_expert_weights[start_idx:end_idx] = flat_expert_weights[idxs[start_idx:end_idx]] + group_sizes = torch.tensor(counts, dtype=torch.int32, device=moe.device) + group_offset = torch.tensor(tokens_per_expert - counts, dtype=torch.int32, device=moe.device) + group_padded_offsets = [0 for _ in range(len(group_sizes))] + for i in range(1, len(group_sizes)): + group_padded_offsets[i] = group_padded_offsets[i - 1] + math.ceil((counts[i - 1] + 1) / moe.padding_M) * moe.padding_M + block_token = 128 + M = ( + math.ceil(moe.config["batch_size"] * moe.config["seq_len"] * moe.config["n_experts_per_token"] / block_token) + + moe.config["n_routed_experts"] + ) + group_idx_for_bx = [0 for _ in range(M)] + for bx in range(M): + m_start_padded = bx * block_token + for i in range(moe.config["n_routed_experts"]): + if m_start_padded >= group_padded_offsets[i]: + group_idx_for_bx[bx] = i + group_padded_offsets = torch.tensor(group_padded_offsets, dtype=torch.int32, device=moe.device) + group_idx_for_bx = torch.tensor(group_idx_for_bx, dtype=torch.int32, device=moe.device) + + def run_shared_kernel_only(): + moe.routed_kernel( + moe.stacked_expert_tokens, + moe.stacked_expert_w_gate, + moe.stacked_expert_w_up, + moe.stacked_expert_w_down, + moe.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + moe.up_logits_routed, + moe.expert_output_routed, + ) + + def run_routed_kernel_only(): + moe.routed_kernel( + 
moe.stacked_expert_tokens, + moe.stacked_expert_w_gate, + moe.stacked_expert_w_up, + moe.stacked_expert_w_down, + moe.stacked_expert_weights, + group_sizes, + group_offset, + group_padded_offsets, + group_idx_for_bx, + moe.up_logits_routed, + moe.expert_output_routed, + ) + + return do_bench(run_routed_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/fusedmoe/example_fusedmoe_torch.py b/examples/fusedmoe/example_fusedmoe_torch.py index 00219c6e9..6b6322aff 100644 --- a/examples/fusedmoe/example_fusedmoe_torch.py +++ b/examples/fusedmoe/example_fusedmoe_torch.py @@ -6,7 +6,6 @@ # Reference code in PyTorch class ExpertTorch(nn.Module): - def __init__(self, config: Dict, d_expert: Optional[int] = None): super().__init__() self.config = config @@ -25,7 +24,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class MoEGateTorch(nn.Module): - def __init__(self, config: Dict): super().__init__() self.top_k: int = config["n_experts_per_token"] @@ -43,12 +41,10 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: class MoETorch(nn.Module): - def __init__(self, config: Dict): super().__init__() self.config = config - self.experts = nn.ModuleList( - [ExpertTorch(config) for _ in range(config["n_routed_experts"])]) + self.experts = nn.ModuleList([ExpertTorch(config) for _ in range(config["n_routed_experts"])]) self.gating_network = MoEGateTorch(config) shared_expert_dim = config["d_expert"] * config["n_shared_experts"] self.shared_expert = ExpertTorch(config=config, d_expert=shared_expert_dim) @@ -67,8 +63,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return routed_output + shared_output @torch.no_grad() - def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, - flat_expert_weights: torch.Tensor) -> torch.Tensor: + def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, flat_expert_weights: torch.Tensor) -> torch.Tensor: expert_cache = torch.zeros_like(x) # test_expert_cache = torch.zeros((x.shape[0] * self.config["n_experts_per_token"], self.config["d_hidden"])) # test_expert_tokens = torch.zeros((x.shape[0] * self.config["n_experts_per_token"], self.config["d_hidden"])) @@ -91,8 +86,7 @@ def moe_infer(self, x: torch.Tensor, flat_expert_indices: torch.Tensor, expert_out = expert(expert_tokens) expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]]) - expert_cache.scatter_reduce_( - 0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum') + expert_cache.scatter_reduce_(0, exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce="sum") return expert_cache @@ -116,21 +110,21 @@ def ref_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: moe = MoETorch(config) # Fill in the given weights of the model - moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight']) + moe.gating_network.W_g.weight = nn.Parameter(weights["router.weight"]) for i in range(num_experts): - gate_proj_weight = weights[f'experts.{i}.0.weight'] - up_proj_weight = weights[f'experts.{i}.1.weight'] - down_proj_weight = weights[f'experts.{i}.2.weight'] + gate_proj_weight = weights[f"experts.{i}.0.weight"] + up_proj_weight = weights[f"experts.{i}.1.weight"] + down_proj_weight = weights[f"experts.{i}.2.weight"] # Transpose weights to match expected shape for nn.Linear moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t()) moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t()) moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t()) - 
moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t()) - moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t()) - moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t()) + moe.shared_expert.W_gate.weight = nn.Parameter(weights["shared_experts.0.weight"].t()) + moe.shared_expert.W_up.weight = nn.Parameter(weights["shared_experts.1.weight"].t()) + moe.shared_expert.W_down.weight = nn.Parameter(weights["shared_experts.2.weight"].t()) output = moe(input_tensor) @@ -140,10 +134,9 @@ def ref_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: # Input generation for the reference code -def generate_input(dhidden: int, dexpert: int, nroutedexperts: int, nsharedexperts: int, - nexpertspertoken: int, bs: int, seqlen: int, - seed: int) -> Tuple[torch.Tensor, Dict, Dict]: - +def generate_input( + dhidden: int, dexpert: int, nroutedexperts: int, nsharedexperts: int, nexpertspertoken: int, bs: int, seqlen: int, seed: int +) -> Tuple[torch.Tensor, Dict, Dict]: # Really dumb but for now _ isn't parsing correctly. d_hidden = dhidden d_expert = dexpert @@ -163,50 +156,40 @@ def generate_input(dhidden: int, dexpert: int, nroutedexperts: int, nsharedexper "seq_len": seq_len, } - gen = torch.Generator(device='cuda') + gen = torch.Generator(device="cuda") gen.manual_seed(seed) num_experts = n_routed_experts expert_dim = d_expert weights = {} - input_tensor = torch.randn((batch_size, seq_len, d_hidden), - device='cuda', - dtype=torch.float16, - generator=gen).contiguous() + input_tensor = torch.randn((batch_size, seq_len, d_hidden), device="cuda", dtype=torch.float16, generator=gen).contiguous() # Initialize router weights - weights['router.weight'] = torch.randn( - (num_experts, d_hidden), device="cuda", dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) + weights["router.weight"] = torch.randn((num_experts, d_hidden), device="cuda", dtype=torch.float16, generator=gen) / math.sqrt(d_hidden) for i in range(num_experts): - weights[f'experts.{i}.0.weight'] = torch.randn( - (d_hidden, expert_dim), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim) - - weights[f'experts.{i}.1.weight'] = torch.randn( - (d_hidden, expert_dim), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim) - - weights[f'experts.{i}.2.weight'] = torch.randn( - (expert_dim, d_hidden), device='cuda', dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) - - weights['shared_experts.0.weight'] = torch.randn( - (d_hidden, expert_dim * n_shared_experts), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim * n_shared_experts) - weights['shared_experts.1.weight'] = torch.randn( - (d_hidden, expert_dim * n_shared_experts), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(expert_dim * n_shared_experts) - weights['shared_experts.2.weight'] = torch.randn((expert_dim * n_shared_experts, d_hidden), - device='cuda', - dtype=torch.float16, - generator=gen) / math.sqrt(d_hidden) + weights[f"experts.{i}.0.weight"] = torch.randn( + (d_hidden, expert_dim), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim) + + weights[f"experts.{i}.1.weight"] = torch.randn( + (d_hidden, expert_dim), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim) + + weights[f"experts.{i}.2.weight"] = torch.randn( + (expert_dim, d_hidden), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(d_hidden) + + 
weights["shared_experts.0.weight"] = torch.randn( + (d_hidden, expert_dim * n_shared_experts), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim * n_shared_experts) + weights["shared_experts.1.weight"] = torch.randn( + (d_hidden, expert_dim * n_shared_experts), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(expert_dim * n_shared_experts) + weights["shared_experts.2.weight"] = torch.randn( + (expert_dim * n_shared_experts, d_hidden), device="cuda", dtype=torch.float16, generator=gen + ) / math.sqrt(d_hidden) return (input_tensor, weights, config) diff --git a/examples/fusedmoe/regression_example_fusedmoe.py b/examples/fusedmoe/regression_example_fusedmoe.py new file mode 100644 index 000000000..ac0f18aae --- /dev/null +++ b/examples/fusedmoe/regression_example_fusedmoe.py @@ -0,0 +1,19 @@ +import tilelang.testing +import example_fusedmoe_tilelang + + +def regression_example_fusedmoe_tilelang(): + tilelang.testing.process_func( + example_fusedmoe_tilelang.run_regression_perf, + d_hidden=1024, + d_expert=256, + n_routed_experts=8, + n_shared_experts=1, + n_experts_per_token=4, + batch_size=1, + seq_len=1024, + ) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/fusedmoe/test_example_fusedmoe.py b/examples/fusedmoe/test_example_fusedmoe.py index 806aff49e..ba8415895 100644 --- a/examples/fusedmoe/test_example_fusedmoe.py +++ b/examples/fusedmoe/test_example_fusedmoe.py @@ -4,13 +4,8 @@ def test_example_fusedmoe_tilelang(): example_fusedmoe_tilelang.main( - d_hidden=1024, - d_expert=256, - n_routed_experts=8, - n_shared_experts=1, - n_experts_per_token=4, - batch_size=1, - seq_len=1024) + d_hidden=1024, d_expert=256, n_routed_experts=8, n_shared_experts=1, n_experts_per_token=4, batch_size=1, seq_len=1024 + ) if __name__ == "__main__": diff --git a/examples/gdn/example_chunk_delta_bwd.py b/examples/gdn/example_chunk_delta_bwd.py index 518b0ee21..4230df525 100644 --- a/examples/gdn/example_chunk_delta_bwd.py +++ b/examples/gdn/example_chunk_delta_bwd.py @@ -12,6 +12,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__, flush=True) from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_bwd_dhu except ImportError: @@ -24,7 +25,7 @@ torch.random.manual_seed(0) # torch.set_printoptions(profile="full") -from utils import * +from test_utils import assert_similar def prepare_input( @@ -49,6 +50,7 @@ def prepare_input( G = F.logsigmoid(G) try: from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) except ImportError: print("fla not found, skip cumsum") @@ -125,8 +127,11 @@ def torch_chunk_gated_delta_rule_bwd_dhu( DV = dv.shape[-1] block_S = 64 BS = S // block_S - dh, dh0, dv2 = torch.empty((B, BS, H, DK, DV), dtype=output_dtype), torch.empty( - (B, H, DK, DV), dtype=state_dtype), torch.empty((B, S, H, DV), dtype=output_dtype) + dh, dh0, dv2 = ( + torch.empty((B, BS, H, DK, DV), dtype=output_dtype), + torch.empty((B, H, DK, DV), dtype=state_dtype), + torch.empty((B, S, H, DV), dtype=output_dtype), + ) dh_tmp = torch.empty((B, H, DK, DV), dtype=accum_dtype) dv_tmp = torch.empty((B, S, H, DV), dtype=accum_dtype) Q_tmp = torch.empty((B, S, H, DK), dtype=accum_dtype) @@ -138,34 +143,30 @@ def torch_chunk_gated_delta_rule_bwd_dhu( for i_s in range(BS - 1, -1, -1): dh[:, i_s, :, :, :] = dh_tmp - dv_tmp = torch.matmul(K[:, i_s * block_S:(i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), - dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) + dv_tmp = 
torch.matmul(K[:, i_s * block_S : (i_s + 1) * block_S, :, :].permute(0, 2, 1, 3), dh_tmp.to(K.dtype)).permute(0, 2, 1, 3) if use_g: for i_bh in range(B * H): i_b, i_h = i_bh // H, i_bh % H for i_s2 in range(block_S): - if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, - i_h] <= 0: - dv_tmp[i_b, i_s2, - i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - - G[i_b, i_s * block_S + i_s2, i_h]) + if G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, i_h] <= 0: + dv_tmp[i_b, i_s2, i_h, :] *= torch.exp(G[i_b, i_s * block_S + block_S - 1, i_h] - G[i_b, i_s * block_S + i_s2, i_h]) else: dv_tmp[i_b, i_s2, i_h, :] = 0 - dv_tmp += dv[:, i_s * block_S:(i_s + 1) * block_S, :, :] - dv2[:, i_s * block_S:(i_s + 1) * block_S, :, :] = dv_tmp + dv_tmp += dv[:, i_s * block_S : (i_s + 1) * block_S, :, :] + dv2[:, i_s * block_S : (i_s + 1) * block_S, :, :] = dv_tmp if use_g: G_last = G[:, i_s * block_S + block_S - 1, :] for i_bh in range(B * H): i_b, i_h = i_bh // H, i_bh % H dh_tmp[i_b, i_h, :, :] *= torch.exp(G_last[i_b, i_h]) - Q_tmp = Q[:, i_s * block_S:(i_s + 1) * block_S, :, :] + Q_tmp = Q[:, i_s * block_S : (i_s + 1) * block_S, :, :] for i_s2 in range(block_S): for i_k in range(DK): Q_tmp[:, i_s2, :, i_k] *= torch.exp(G[:, i_s * block_S + i_s2, :]) Q_tmp *= scale - W_tmp = W[:, i_s * block_S:(i_s + 1) * block_S, :, :] - dO_tmp = dO[:, i_s * block_S:(i_s + 1) * block_S, :, :] + W_tmp = W[:, i_s * block_S : (i_s + 1) * block_S, :, :] + dO_tmp = dO[:, i_s * block_S : (i_s + 1) * block_S, :, :] torch.backends.cuda.matmul.allow_tf32 = True dh_tmp += torch.matmul(Q_tmp.permute(0, 2, 3, 1), dO_tmp.permute(0, 2, 1, 3)) @@ -223,25 +224,24 @@ def tilelang_chunk_gated_delta_rule_bwd_dhu( @T.prim_func def kernel( - # Input - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - h0: T.Tensor(h0_shape, dtype=input_dtype), - dht: T.Tensor(dht_shape, dtype=input_dtype), - dO: T.Tensor(dO_shape, dtype=input_dtype), - dv: T.Tensor(dv_shape, dtype=input_dtype), - # Output - dh: T.Tensor(dh_shape, dtype=output_dtype), - dh0: T.Tensor(dh0_shape, dtype=state_dtype), - dv2: T.Tensor(dv2_shape, dtype=output_dtype), + # Input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + h0: T.Tensor(h0_shape, dtype=input_dtype), + dht: T.Tensor(dht_shape, dtype=input_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + # Output + dh: T.Tensor(dh_shape, dtype=output_dtype), + dh0: T.Tensor(dh0_shape, dtype=state_dtype), + dv2: T.Tensor(dv2_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): bb, bh = bbh // H, bbh % H b_dh_shared = T.alloc_shared((DK, block_DV), dtype=output_dtype) - b_dh_shared_fp32 = T.alloc_shared((DK, block_DV), dtype=state_dtype) b_dh_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) b_dh_fragment_1 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) b_dh_fragment_2 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) @@ -249,17 +249,14 @@ def kernel( dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) dv_fragment_2 = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) - dO_shared_t = T.alloc_shared((block_DV, block_S), 
dtype="float32") - dO_fragment = T.alloc_fragment((block_S, block_DV), dtype="float32") - dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype="float32") + dO_shared_t = T.alloc_shared((block_DV, block_S), dtype=T.float32) + dO_fragment = T.alloc_fragment((block_S, block_DV), dtype=T.float32) + dO_fragment_t = T.alloc_fragment((block_DV, block_S), dtype=T.float32) K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) Q_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - Q_shared_fp32 = T.alloc_shared((block_S, DK), dtype="float32") W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - G_last_local = T.alloc_local((1), dtype=gate_dtype) - G_last_local_exp = T.alloc_local((1), dtype=gate_dtype) G_shared = T.alloc_shared((block_S), dtype=gate_dtype, scope="shared") G_fragment = T.alloc_fragment((block_S), dtype=gate_dtype) G_fragment_post = T.alloc_fragment((block_S), dtype=gate_dtype) @@ -269,20 +266,15 @@ def kernel( T.use_swizzle(10) - T.annotate_layout({ - b_dh_shared: tilelang.layout.make_swizzled_layout(b_dh_shared), - b_dh_shared_fp32: tilelang.layout.make_swizzled_layout(b_dh_shared_fp32), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), - dO_shared_t: tilelang.layout.make_swizzled_layout(dO_shared_t), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - Q_shared_fp32: tilelang.layout.make_swizzled_layout(Q_shared_fp32), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - }) + T.annotate_layout( + { + dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), + Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), + } + ) if use_final_state_gradient: - T.copy(dht[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_dh_shared) + T.copy(dht[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_dh_shared) T.copy(b_dh_shared, b_dh_fragment) else: T.clear(b_dh_fragment) @@ -293,57 +285,45 @@ def kernel( # Store the updated dh T.copy(b_dh_fragment, b_dh_shared) - T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) # Update dv - T.copy(K[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], K_shared) + T.copy(K[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], K_shared) T.gemm(K_shared, b_dh_shared, dv_fragment, clear_accum=True) if use_g: - T.copy( - G[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh], - G_shared, - disable_tma=True) + T.copy(G[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh], G_shared, disable_tma=True) T.copy(G_shared, G_fragment) - G_last_local[0] = G_shared[block_S - 1] - G_last_local_exp[0] = T.exp(G_last_local[0]) + G_last_local = G_shared[block_S - 1] + G_last_local_exp = T.exp(G_last_local) for i_s2 in T.Parallel(block_S): - G_fragment_post[i_s2] = T.exp(G_last_local[0] - G_fragment[i_s2]) + G_fragment_post[i_s2] = T.exp(G_last_local - G_fragment[i_s2]) for i_s2, i_v in T.Parallel(block_S, block_DV): - # with T.If(G_last_local[0] - G_shared[i_s2] <= 0): - with T.If(G_last_local[0] - G_fragment[i_s2] <= 0): - with T.Then(): - dv_fragment[i_s2, - i_v] = dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] - with T.Else(): - dv_fragment[i_s2, i_v] = 0 - - T.copy( - dv[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV], dv_shared) + dv_fragment[i_s2, i_v] = ( + dv_fragment[i_s2, i_v] * G_fragment_post[i_s2] if G_last_local 
- G_fragment[i_s2] <= 0 else 0 + ) + + T.copy(dv[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dv_shared) T.copy(dv_shared, dv_fragment_2) for i_s2, i_v in T.Parallel(block_S, block_DV): dv_fragment[i_s2, i_v] = dv_fragment[i_s2, i_v] + dv_fragment_2[i_s2, i_v] # Store the updated dv T.copy(dv_fragment, dv_shared) - T.copy( - dv_shared, dv2[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(dv_shared, dv2[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) # Update dh - T.copy(Q[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) - T.copy(W[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, 0:DK], W_shared) + T.copy(Q[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) + T.copy(W[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], W_shared) T.clear(Q_fragment) if use_g: for i_k, i_v in T.Parallel(DK, block_DV): - b_dh_fragment[i_k, i_v] *= G_last_local_exp[0] + b_dh_fragment[i_k, i_v] *= G_last_local_exp T.copy(Q_shared, Q_fragment) for i_s2 in T.Parallel(block_S): G_fragment_exp[i_s2] = T.exp(G_shared[i_s2]) for i_s2, i_k in T.Parallel(block_S, DK): - # Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * T.exp(G_shared[i_s2]) * scale Q_fragment[i_s2, i_k] = Q_fragment[i_s2, i_k] * G_fragment_exp[i_s2] * scale else: T.copy(Q_shared, Q_fragment) @@ -353,9 +333,7 @@ def kernel( for i_s2, i_k in T.Parallel(block_S, DK): Q_fragment_t[i_k, i_s2] = Q_fragment[i_s2, i_k] - T.copy( - dO[bb, i_s_inv * block_S:(i_s_inv + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV], dO_shared) + T.copy(dO[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dO_shared) T.copy(dO_shared, dO_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): dO_fragment_t[i_v, i_s2] = dO_fragment[i_s2, i_v] @@ -369,7 +347,7 @@ def kernel( b_dh_fragment[i_k, i_v] += b_dh_fragment_1[i_k, i_v] - b_dh_fragment_2[i_k, i_v] if use_initial_state: - T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -444,44 +422,61 @@ def run_test( num_stages=0, use_torch=False, ): - Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dh_ref, dh0_ref, dv2_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dh_ref, dh0_ref, dv2_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) # fla ref print("fla running...", flush=True) if use_g: - dh_ref, dh0_ref, dv2_ref = 
chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, - scale) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, scale) else: G = G.fill_(0) - dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, - scale) + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu(Q, K, W, G, h0, dht, dO, dv, scale) # tilelang print("tilelang running...", flush=True) - kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, - chunk_size, scale, use_g, use_initial_state, - use_final_state_gradient, block_DV, threads, - num_stages) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + block_DV, + threads, + num_stages, + ) # kernel = tilelang.compile(program) print(kernel.get_kernel_source()) dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) - fla_time = do_bench( - chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) + fla_time = do_bench(chunk_gated_delta_rule_bwd_dhu, Q, K, W, G, h0, dht, dO, dv, scale, chunk_size=chunk_size) tilelang_time = do_bench(kernel, Q, K, W, G, h0, dht, dO, dv) print(f"fla time: {fla_time} ms") @@ -496,19 +491,47 @@ def run_test( print("torch running...", flush=True) if use_g: dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( - Q, K, W, G, h0, dht, dO, dv, scale, use_g, use_initial_state, - use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), - getattr(torch, accum_dtype), getattr(torch, - gate_dtype), getattr(torch, state_dtype)) + Q, + K, + W, + G, + h0, + dht, + dO, + dv, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dh_ref_torch = dh_ref_torch.cuda() dh0_ref_torch = dh0_ref_torch.cuda() dv2_ref_torch = dv2_ref_torch.cuda() else: dh_ref_torch, dh0_ref_torch, dv2_ref_torch = torch_chunk_gated_delta_rule_bwd_dhu( - Q, K, W, None, h0, dht, dO, dv, scale, use_g, use_initial_state, - use_final_state_gradient, getattr(torch, input_dtype), getattr(torch, output_dtype), - getattr(torch, accum_dtype), getattr(torch, - gate_dtype), getattr(torch, state_dtype)) + Q, + K, + W, + None, + h0, + dht, + dO, + dv, + scale, + use_g, + use_initial_state, + use_final_state_gradient, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dh_ref_torch = dh_ref_torch.cuda() dh0_ref_torch = dh0_ref_torch.cuda() dv2_ref_torch = dv2_ref_torch.cuda() @@ -554,11 +577,11 @@ def main(): H=8, DK=DK, DV=128, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, scale=DK**-0.5, use_g=True, diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py index 4d6b657ff..2ee84e7bf 100644 --- a/examples/gdn/example_chunk_delta_h.py +++ b/examples/gdn/example_chunk_delta_h.py @@ -3,12 +3,14 @@ import sys # noqa: F401 import tilelang 
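+# `autotune` (imported below) benchmarks each candidate returned by `get_configs()` and
+# keeps the fastest (block_DK, block_DV, threads, num_stages) combination for the fwd_h kernel.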
import tilelang.language as T +from tilelang.autotuner import autotune # Add your fla repository path to sys.path # Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h except ImportError: @@ -19,7 +21,7 @@ import torch.nn.functional as F from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 -from utils import * +from test_utils import assert_similar # (zhengju) We can slightly modify the generated cuda code from tilelang lowering # in the debug folder to make the performance better. To enable this callback, @@ -55,6 +57,7 @@ def prepare_input( G = F.logsigmoid(G) try: from fla.ops.utils.cumsum import chunk_local_cumsum + G = chunk_local_cumsum(G, chunk_size) except ImportError: print("fla not found, skip cumsum") @@ -80,7 +83,21 @@ def prepare_output( return h, final_state, V_new -@tilelang.jit(out_idx=[-3, -2, -1]) +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [128, 256] + num_stages = [1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) def tilelang_chunk_gated_delta_rule_fwd_h( # task config B, @@ -94,15 +111,15 @@ def tilelang_chunk_gated_delta_rule_fwd_h( gate_dtype, state_dtype, chunk_size, - use_g=True, - use_initial_state=True, - store_final_state=True, - save_new_value=True, + use_g, + use_initial_state, + store_final_state, + save_new_value, # kernel config block_DK=64, - block_DV=64, - threads=256, - num_stages=0, + block_DV=32, + threads=128, + num_stages=1, ): block_S = chunk_size BS = S // block_S @@ -118,14 +135,14 @@ def tilelang_chunk_gated_delta_rule_fwd_h( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - U: T.Tensor(U_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), - h: T.Tensor(h_shape, dtype=output_dtype), - final_state: T.Tensor(final_state_shape, dtype=state_dtype), - V_new: T.Tensor(V_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + U: T.Tensor(U_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=output_dtype), + final_state: T.Tensor(final_state_shape, dtype=state_dtype), + V_new: T.Tensor(V_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): bb, bh = bbh // H, bbh % H @@ -139,39 +156,35 @@ def kernel( V_new_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) V_new_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) - G_last_local = T.alloc_local((1), dtype=gate_dtype) + G_last_local = T.alloc_var(T.float32) G_shared = T.alloc_shared((block_S, block_DV), dtype=gate_dtype) G_fragment = T.alloc_fragment((block_S, block_DV), dtype=gate_dtype) - T.annotate_layout({ - b_h_shared: 
tilelang.layout.make_swizzled_layout(b_h_shared), - U_shared: tilelang.layout.make_swizzled_layout(U_shared), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - V_new_shared: tilelang.layout.make_swizzled_layout(V_new_shared), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - G_shared: tilelang.layout.make_swizzled_layout(G_shared), - }) + T.annotate_layout( + { + U_shared: tilelang.layout.make_swizzled_layout(U_shared), + G_shared: tilelang.layout.make_swizzled_layout(G_shared), + } + ) T.use_swizzle(10) if use_initial_state: - T.copy(initial_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV], b_h_shared) + T.copy(initial_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_h_shared) T.copy(b_h_shared, b_h_fragment) else: T.clear(b_h_fragment) for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): # Store previous result to the hidden tensor, like the epilogue - T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_h_shared, h[bb, i_s, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) # Recurrence - T.copy(W[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], W_shared) + T.copy(W[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], W_shared) T.gemm(W_shared, b_h_shared, V_new_fragment, clear_accum=True) # U - W * S - T.copy( - U[bb, i_s * block_S:(i_s + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], - U_shared) + T.copy(U[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], U_shared) T.copy(U_shared, U_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): V_new_fragment[i_s2, i_v] = -V_new_fragment[i_s2, i_v] + U_fragment[i_s2, i_v] @@ -179,27 +192,24 @@ def kernel( # Save V_new if save_new_value: T.copy(V_new_fragment, dst=V_new_shared) - T.copy( - V_new_shared, V_new[bb, i_s * block_S:(i_s + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(V_new_shared, V_new[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) - T.copy(K[bb, i_s * block_S:(i_s + 1) * block_S, bh, 0:DK], K_shared) + T.copy(K[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], K_shared) # use_g if use_g: - G_last_local[0] = G[bb, (i_s + 1) * block_S - 1, bh] + G_last_local = G[bb, (i_s + 1) * block_S - 1, bh] for i_s2, i_v in T.Parallel(block_S, block_DV): G_shared[i_s2, i_v] = G[bb, i_s * block_S + i_s2, bh] T.copy(G_shared, G_fragment) for i_s2, i_v in T.Parallel(block_S, block_DV): - with T.If(G_last_local[0] - G_fragment[i_s2, i_v] <= 0): - with T.Then(): - V_new_fragment[i_s2, i_v] = V_new_fragment[i_s2, i_v] * T.exp( - G_last_local[0] - G_fragment[i_s2, i_v]) - with T.Else(): - V_new_fragment[i_s2, i_v] = 0 - G_last_local[0] = T.exp(G_last_local[0]) + V_new_fragment[i_s2, i_v] = ( + V_new_fragment[i_s2, i_v] * T.exp2((G_last_local - G_fragment[i_s2, i_v]) * 1.442695) + if G_last_local - G_fragment[i_s2, i_v] <= 0 + else 0 + ) + G_last_local = T.exp2(G_last_local * 1.442695) for i_k, i_v in T.Parallel(DK, block_DV): - b_h_fragment[i_k, i_v] *= G_last_local[0] + b_h_fragment[i_k, i_v] *= G_last_local # Update intermediate results T.copy(V_new_fragment, V_new_shared) @@ -209,7 +219,7 @@ def kernel( # Save final state if store_final_state: - T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV:(bv + 1) * block_DV]) + T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -260,47 +270,77 @@ def run_test( threads=128, num_stages=0, ): - K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, 
- getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) - h_ref, final_state_ref, V_new_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, state_dtype)) - h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, state_dtype)) + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + h_ref, final_state_ref, V_new_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) # fla ref - h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h(K, W, U, G, initial_state, - store_final_state, chunk_size, - save_new_value) + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h( + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + ) # tilelang - kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g, + use_initial_state, + store_final_state, + save_new_value, + ) h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # (zhengju) If you want to print the generated cuda code, you can uncomment the following line # print("CUDA Code:\n", kernel.get_kernel_source()) - fla_time = do_bench(chunk_gated_delta_rule_fwd_h, K, W, U, G, initial_state, store_final_state, - chunk_size, save_new_value) + fla_time = do_bench( + chunk_gated_delta_rule_fwd_h, + k=K, + w=W, + u=U, + g=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + ) tilelang_time = do_bench(kernel, K, W, U, G, initial_state) # check correctness try: h_ref_fp32 = h_ref.to(torch.float32) h_tilelang_fp32 = h_tilelang.to(torch.float32) - assert_similar( - h_ref_fp32, - h_tilelang_fp32, - eps=1e-5, - name="tilelang chunk gated delta rule fwd h", - raise_assert=False) + assert_similar(h_ref_fp32, h_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd h", raise_assert=False) print("tilelang chunk gated delta rule fwd h passed √") except Exception as e: print("tilelang chunk gated delta rule fwd h failed ✗") @@ -314,7 +354,8 @@ def run_test( final_state_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd final_state", - raise_assert=False) + raise_assert=False, + ) print("tilelang chunk gated delta rule fwd final_state passed √") except Exception as e: print("tilelang chunk gated delta rule fwd final_state failed ✗") @@ -323,12 +364,7 @@ def run_test( try: V_new_ref_fp32 = V_new_ref.to(torch.float32) V_new_tilelang_fp32 = V_new_tilelang.to(torch.float32) - assert_similar( - V_new_ref_fp32, - V_new_tilelang_fp32, - eps=1e-5, - name="tilelang chunk gated delta rule 
fwd V_new", - raise_assert=False) + assert_similar(V_new_ref_fp32, V_new_tilelang_fp32, eps=1e-5, name="tilelang chunk gated delta rule fwd V_new", raise_assert=False) print("tilelang chunk gated delta rule fwd V_new passed √") except Exception as e: print("tilelang chunk gated delta rule fwd V_new failed ✗") @@ -345,20 +381,20 @@ def main(): H=32, DK=128, DV=128, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, use_g=True, - use_initial_state=True, + use_initial_state=False, store_final_state=True, save_new_value=True, - block_DK=64, + block_DK=32, block_DV=32, threads=128, - num_stages=1, + num_stages=2, ) diff --git a/examples/gdn/example_chunk_o.py b/examples/gdn/example_chunk_o.py index 1c084be70..a4d7281f5 100644 --- a/examples/gdn/example_chunk_o.py +++ b/examples/gdn/example_chunk_o.py @@ -9,6 +9,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_o import chunk_fwd_o except ImportError: @@ -87,16 +88,14 @@ def tilelang_chunk_fwd_o( @T.prim_func def kernel( - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - HIDDEN: T.Tensor(H_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - O: T.Tensor(O_shape, dtype=output_dtype), + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + HIDDEN: T.Tensor(H_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + O: T.Tensor(O_shape, dtype=output_dtype), ): - with T.Kernel( - T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, - threads=threads) as (bv, bs, bbh): + with T.Kernel(T.ceildiv(DV, block_DV), T.ceildiv(S, block_S), B * H, threads=threads) as (bv, bs, bbh): bb, bh = bbh // H, bbh % H Q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) @@ -109,28 +108,13 @@ def kernel( G_shared = T.alloc_shared((block_S,), dtype=gate_dtype, scope="shared") G_diff_local = T.alloc_fragment((block_S, block_S), dtype=gate_dtype) - T.annotate_layout({ - Q_shared: tilelang.layout.make_swizzled_layout(Q_shared), - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - H_shared: tilelang.layout.make_swizzled_layout(H_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - T.clear(A_fragment) T.clear(O_fragment) T.disable_warp_group_reg_alloc() for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - Q[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - Q_shared) - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) - T.copy( - HIDDEN[bb, bs, bh, i_k * block_DK:(i_k + 1) * block_DK, - bv * block_DV:(bv + 1) * block_DV], H_shared) + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], Q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) + T.copy(HIDDEN[bb, bs, bh, i_k * block_DK : (i_k + 1) * block_DK, bv * block_DV : (bv + 1) * block_DV], H_shared) T.gemm(Q_shared, 
H_shared, O_fragment) T.gemm(Q_shared, K_shared, A_fragment, transpose_B=True) @@ -145,8 +129,7 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): with T.If(G_diff_local[i_s1, i_s2] <= 0): with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( - G_diff_local[i_s1, i_s2]) + A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]) with T.Else(): A_fragment[i_s1, i_s2] = 0 @@ -155,8 +138,7 @@ def kernel( with T.Then(): A_fragment[i_s1, i_s2] = 0 - T.copy(V[bb, bs * block_S:(bs + 1) * block_S, bh, bv * block_DV:(bv + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], V_shared) T.copy(A_fragment, A_shared) T.gemm(A_shared, V_shared, O_fragment) @@ -164,8 +146,7 @@ def kernel( O_fragment[i_s, i_v] = O_fragment[i_s, i_v] * scale T.copy(O_fragment, O_shared) - T.copy(O_shared, O[bb, bs * block_S:(bs + 1) * block_S, bh, - bv * block_DV:(bv + 1) * block_DV]) + T.copy(O_shared, O[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) return kernel @@ -191,8 +172,9 @@ def run_test( output_dtype_torch = getattr(torch, output_dtype) accum_dtype_torch = getattr(torch, accum_dtype) gate_dtype_torch = getattr(torch, gate_dtype) - Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, input_dtype_torch, - output_dtype_torch, accum_dtype_torch, gate_dtype_torch) + Q, K, V, HIDDEN, G = prepare_input( + B, S, H, DK, DV, chunk_size, input_dtype_torch, output_dtype_torch, accum_dtype_torch, gate_dtype_torch + ) scale = 1.0 / DK**0.5 O_ref = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) @@ -200,9 +182,25 @@ def run_test( block_S = chunk_size O_tilelang = prepare_output(B, S, H, DK, DV, chunk_size, output_dtype_torch) - kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, - threads, num_stages) + kernel = tilelang_chunk_fwd_o( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + block_S, + block_DK, + block_DV, + threads, + num_stages, + ) O_tilelang = kernel(Q, K, V, HIDDEN, G) try: @@ -221,10 +219,10 @@ def main(): DK=128, DV=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, use_g=True, block_DK=128, block_DV=128, diff --git a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py index 76b4792df..e589818f4 100644 --- a/examples/gdn/example_chunk_o_bwd.py +++ b/examples/gdn/example_chunk_o_bwd.py @@ -7,13 +7,12 @@ import tilelang.language as T from tilelang.engine.callback import register_cuda_postproc_callback # noqa: F401 -print(tilelang.__file__) - # Add your fla repository path to sys.path # Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_o import chunk_bwd_dqkwg except ImportError: @@ -21,7 +20,7 @@ fla = None import torch -from utils import * +from test_utils import assert_similar torch.random.manual_seed(0) # torch.set_printoptions(profile="full") @@ -110,10 +109,8 @@ def prepare_output( @tilelang.jit( out_idx=[-4, -3, -2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - 
tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) def tilelang_chunk_o_bwd_dqkwg( # task config B, @@ -157,25 +154,23 @@ def tilelang_chunk_o_bwd_dqkwg( @T.prim_func def kernel( - # input - Q: T.Tensor(Q_shape, dtype=input_dtype), - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - h: T.Tensor(h_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - dO: T.Tensor(dO_shape, dtype=input_dtype), - dh: T.Tensor(dh_shape, dtype=input_dtype), - dv: T.Tensor(dv_shape, dtype=input_dtype), - W: T.Tensor(W_shape, dtype=input_dtype), - # output - dq: T.Tensor(dq_shape, dtype=output_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dw: T.Tensor(dw_shape, dtype=output_dtype), - dg: T.Tensor(dg_shape, dtype=gate_dtype), + # input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dh: T.Tensor(dh_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + # output + dq: T.Tensor(dq_shape, dtype=output_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dw: T.Tensor(dw_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), ): - with T.Kernel( - T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, - threads=threads) as (bk, bs, bbh): + with T.Kernel(T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, threads=threads) as (bk, bs, bbh): bb, bh = bbh // H, bbh % H V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) @@ -204,27 +199,27 @@ def kernel( dg_fragment = T.alloc_fragment((block_S,), dtype=gate_dtype) dg_fragment_2 = T.alloc_fragment((block_S,), dtype=gate_dtype) dg_fragment_final = T.alloc_fragment((block_S,), dtype=gate_dtype) - dg_last_local = T.alloc_local((2,), dtype=gate_dtype) + dg_last_local_0 = T.alloc_var(dtype=gate_dtype) + dg_last_local_1 = T.alloc_var(dtype=gate_dtype) + G_last_local = T.alloc_var(dtype=gate_dtype) + dg_last_fragment = T.alloc_fragment((block_DV * block_DK), dtype=gate_dtype) dg_last_fragment_scalar = T.alloc_fragment((1,), dtype=gate_dtype) dg_last_fragment_2 = T.alloc_fragment((block_S * block_DK), dtype=gate_dtype) dg_last_fragment_scalar_2 = T.alloc_fragment((1,), dtype=gate_dtype) - G_shared = T.alloc_shared((block_S, block_DK), dtype=gate_dtype, scope="shared") - G_last_local = T.alloc_local((1,), dtype=gate_dtype) + G_shared = T.alloc_shared((block_S, block_DK), dtype=gate_dtype) T.use_swizzle(10) - T.annotate_layout({ - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - dO_shared: tilelang.layout.make_swizzled_layout(dO_shared), - h_shared: tilelang.layout.make_swizzled_layout(h_shared), - dh_shared: tilelang.layout.make_swizzled_layout(dh_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared), - q_shared: tilelang.layout.make_swizzled_layout(q_shared), - k_shared: tilelang.layout.make_swizzled_layout(k_shared), - }) - - T.clear(dg_last_local) + T.annotate_layout( + { + q_shared: tilelang.layout.make_swizzled_layout(q_shared), + k_shared: tilelang.layout.make_swizzled_layout(k_shared), + } + ) + + T.clear(dg_last_local_0) + T.clear(dg_last_local_1) T.clear(G_last_local) T.clear(G_shared) T.clear(q_fragment) @@ -237,18 +232,10 @@ def kernel( 
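+            # The pipelined loop below accumulates ds (dO @ V^T), dq (dO @ h^T) and
+            # dk (V @ dh^T) over DV tiles, plus dw (dv @ h^T) when use_dw is enabled.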
T.clear(dw_fragment) for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) - T.copy( - dO[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], dO_shared) - T.copy( - h[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, - i_v * block_DV:(i_v + 1) * block_DV], h_shared) - T.copy( - dh[bb, bs, bh, bk * block_DK:(bk + 1) * block_DK, - i_v * block_DV:(i_v + 1) * block_DV], dh_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) + T.copy(dO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], dO_shared) + T.copy(h[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], h_shared) + T.copy(dh[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], dh_shared) if use_g: T.clear(dg_last_fragment_scalar) @@ -256,32 +243,25 @@ def kernel( # for i_kv in T.Parallel(block_DK * block_DV): # dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] for i_kv in T.Parallel(block_DK * block_DV): - i_k, i_v = i_kv // block_DV, i_kv % block_DV - dg_last_fragment[i_kv] = h_shared[i_k, i_v] * dh_shared[i_k, i_v] + dg_last_fragment[i_kv] = h_shared[i_kv // block_DV, i_kv % block_DV] * dh_shared[i_kv // block_DV, i_kv % block_DV] T.reduce_sum(dg_last_fragment, dg_last_fragment_scalar, dim=-1, clear=False) - dg_last_local[0] += dg_last_fragment_scalar[0] + dg_last_local_0 = dg_last_local_0 + dg_last_fragment_scalar[0] T.gemm(dO_shared, V_shared, ds_fragment, transpose_B=True) T.gemm(dO_shared, h_shared, dq_fragment, transpose_B=True) T.gemm(V_shared, dh_shared, dk_fragment, transpose_B=True) if use_dw: - T.copy( - dv[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], dv_shared) + T.copy(dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], dv_shared) T.gemm(dv_shared, h_shared, dw_fragment, transpose_B=True) if use_dw: for i_s, i_k in T.Parallel(block_S, block_DK): dw_fragment[i_s, i_k] = -dw_fragment[i_s, i_k] - T.copy( - dw_fragment, dw[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - - T.copy(Q[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], - q_shared) - T.copy(K[bb, bs * block_S:(bs + 1) * block_S, bh, bk * block_DK:(bk + 1) * block_DK], - k_shared) + T.copy(dw_fragment, dw[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], k_shared) T.copy(q_shared, q_fragment) T.copy(k_shared, k_fragment) @@ -290,13 +270,12 @@ def kernel( T.clear(dg_fragment_2) for i_s, i_k in T.Parallel(block_S, block_DK): G_shared[i_s, i_k] = G[bb, bs * block_S + i_s, bh] - G_last_local[0] = G[bb, bs * block_S + block_S - 1, bh] + dg_last_local_0 = G[bb, bs * block_S + block_S - 1, bh] # Use gmem directly instead of local register - dg_last_local[0] = dg_last_local[0] * T.exp(G[bb, bs * block_S + block_S - 1, bh]) + dg_last_local_0 = dg_last_local_0 * T.exp(G[bb, bs * block_S + block_S - 1, bh]) for i_s, i_k in T.Parallel(block_S, block_DK): - dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, - bh]) * scale + 
dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * T.exp(G[bb, bs * block_S + i_s, bh]) * scale T.clear(dg_fragment_reduce_tmp) for i_s, i_k in T.Parallel(block_S, block_DK): dg_fragment_reduce_tmp[i_s, i_k] = dq_fragment[i_s, i_k] * q_shared[i_s, i_k] @@ -304,12 +283,11 @@ def kernel( T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=-1, clear=False) for i_s, i_k in T.Parallel(block_S, block_DK): - with T.If(G_last_local[0] - G[bb, bs * block_S + i_s, bh] <= 0): - with T.Then(): - dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp( - G_last_local[0] - G[bb, bs * block_S + i_s, bh]) - with T.Else(): - dk_fragment[i_s, i_k] = 0 + dk_fragment[i_s, i_k] = ( + dk_fragment[i_s, i_k] * T.exp(G_last_local - G[bb, bs * block_S + i_s, bh]) + if G_last_local - G[bb, bs * block_S + i_s, bh] <= 0 + else 0 + ) T.clear(dg_fragment_reduce_tmp) for i_s, i_k in T.Parallel(block_S, block_DK): dg_fragment_reduce_tmp[i_s, i_k] = dk_fragment[i_s, i_k] * (-k_shared[i_s, i_k]) @@ -323,24 +301,20 @@ def kernel( i_s, i_k = i_sk // block_DK, i_sk % block_DK dg_last_fragment_2[i_sk] = dk_shared[i_s, i_k] * k_shared[i_s, i_k] T.reduce_sum(dg_last_fragment_2, dg_last_fragment_scalar_2, dim=-1, clear=False) - dg_last_local[1] = dg_last_fragment_scalar_2[0] + dg_last_local_1 = dg_last_fragment_scalar_2[0] for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 >= i_s2 and - G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): - with T.Then(): - ds_fragment[i_s1, i_s2] = ds_fragment[ - i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - - G[bb, bs * block_S + i_s2, bh]) * scale - with T.Else(): - ds_fragment[i_s1, i_s2] = 0 + ds_fragment[i_s1, i_s2] = ( + (ds_fragment[i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]) * scale) + if G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0 + else 0 + ) T.clear(ds_fragment_positive) T.clear(ds_fragment_positive_transpose) T.gemm(q_shared, k_shared, ds_fragment_positive, transpose_B=True) for i_s1, i_s2 in T.Parallel(block_S, block_S): - ds_fragment_positive[ - i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] + ds_fragment_positive[i_s1, i_s2] = ds_fragment[i_s1, i_s2] * ds_fragment_positive[i_s1, i_s2] # FIXME: The reduce_sum statement with clear=True will cause an error of warp specialized pass T.reduce_sum(ds_fragment_positive, dg_fragment, dim=1, clear=False) @@ -362,25 +336,16 @@ def kernel( T.gemm(ds_shared, q_shared, dk_fragment, transpose_A=True) for i_s in T.Parallel(block_S): - with T.If(i_s >= block_S - 1): # noqa: SIM117 - with T.Then(): - dg_fragment_final[ - i_s] = dg_fragment_final[i_s] + dg_last_local[0] + dg_last_local[1] - - T.copy( - dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) + dg_fragment_final[i_s] = dg_fragment_final[i_s] + dg_last_local_0 + dg_last_local_1 + + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) for i_s in T.Parallel(block_S): dg[bk, bb, bs * block_S + i_s, bh] = dg_fragment_final[i_s] else: for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 < i_s2): # noqa: SIM117 - with T.Then(): - ds_fragment[i_s1, i_s2] = 0 + ds_fragment[i_s1, i_s2] = 0 if i_s1 < i_s2 else ds_fragment[i_s1, i_s2] T.clear(dk_fragment_2) 
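+                    # Non-gated path: ds was masked to the causal lower triangle above and is
+                    # now folded back into dq and dk through the GEMMs below.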
T.copy(ds_fragment, ds_shared) T.gemm(ds_shared, k_shared, dq_fragment) @@ -388,12 +353,8 @@ def kernel( for i_s, i_k in T.Parallel(block_S, block_DK): dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * scale dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] + dk_fragment_2[i_s, i_k] * scale - T.copy( - dq_fragment, dq[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - bk * block_DK:(bk + 1) * block_DK]) + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) return kernel @@ -443,33 +404,53 @@ def run_test( threads=256, num_stages=0, ): - Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dq_ref, dk_ref, dw_ref, dg_ref = prepare_output(B, S, H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype), block_DK) + Q, K, V, h, G, dO, dh, dv, W = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dq_ref, dk_ref, dw_ref, dg_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype), block_DK + ) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype), block_DK) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype), block_DK + ) # ref if use_g: - dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( - Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg(Q, K, V, G, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) else: - dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg( - Q, K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) + dq_ref, dk_ref, dw_ref, dg_ref = chunk_bwd_dqkwg(Q, K, V, None, dO, h, dh, dv, W, chunk_size=chunk_size, scale=scale) # tilelang - kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, scale, use_g, use_dw, - block_DK, block_DV, threads, num_stages) - print(kernel.get_kernel_source()) + kernel = tilelang_chunk_o_bwd_dqkwg( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_g, + use_dw, + block_DK, + block_DV, + threads, + num_stages, + ) dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) if use_g: @@ -516,11 +497,11 @@ def main(): H=8, DK=DK, DV=DV, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, scale=DK**-0.5, # scale=1, diff --git a/examples/gdn/example_chunk_scaled_dot_kkt.py b/examples/gdn/example_chunk_scaled_dot_kkt.py index d07a4776a..8c7a4d573 100644 --- 
a/examples/gdn/example_chunk_scaled_dot_kkt.py +++ b/examples/gdn/example_chunk_scaled_dot_kkt.py @@ -9,6 +9,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.common.chunk_scaled_dot_kkt import chunk_scaled_dot_kkt_fwd except ImportError: @@ -56,9 +57,9 @@ def tilelang_chunk_scaled_dot_kkt_fwd( H, DK, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, use_g=True, # kernel config block_S=64, @@ -75,10 +76,10 @@ def tilelang_chunk_scaled_dot_kkt_fwd( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=accum_dtype), - A: T.Tensor(output_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=accum_dtype), + A: T.Tensor(output_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -93,20 +94,13 @@ def kernel( G_shared = T.alloc_shared((block_S,), dtype=accum_dtype, scope="shared") G_diff_local = T.alloc_fragment((block_S, block_S), dtype=accum_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - }) - T.fill(A_fragment, 0) T.disable_warp_group_reg_alloc() for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): Beta_K_fragment[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] T.gemm(Beta_K_fragment, K_shared, A_fragment, transpose_B=True) @@ -119,8 +113,7 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): with T.If(G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2): with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp( - G_diff_local[i_s1, i_s2]) + A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]) with T.Else(): A_fragment[i_s1, i_s2] = 0 else: @@ -130,7 +123,7 @@ def kernel( A_fragment[i_s1, i_s2] = 0 T.copy(A_fragment, A_shared) - T.copy(A_shared, A[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + T.copy(A_shared, A[bb, bs * block_S : (bs + 1) * block_S, bh, :]) return kernel @@ -149,24 +142,21 @@ def run_test( threads, num_stages, ): - K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype)) + K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), getattr(torch, output_dtype), getattr(torch, accum_dtype)) A_ref = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) A_tilelang = prepare_output(B, S, H, chunk_size, getattr(torch, output_dtype)) # reference if use_g: - A_ref = chunk_scaled_dot_kkt_fwd( - K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + A_ref = chunk_scaled_dot_kkt_fwd(K, Beta, G, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) else: - A_ref = chunk_scaled_dot_kkt_fwd( - K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) + A_ref = 
chunk_scaled_dot_kkt_fwd(K, Beta, None, chunk_size=chunk_size, output_dtype=getattr(torch, output_dtype)) # tilelang block_S = chunk_size - kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, - accum_dtype, use_g, block_S, block_DK, threads, - num_stages) + kernel = tilelang_chunk_scaled_dot_kkt_fwd( + B, S, H, DK, chunk_size, input_dtype, output_dtype, accum_dtype, use_g, block_S, block_DK, threads, num_stages + ) A_tilelang = kernel(K, Beta, G) try: @@ -186,13 +176,14 @@ def main(): H=32, DK=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, use_g=True, block_DK=64, threads=128, - num_stages=2) + num_stages=2, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_cumsum.py b/examples/gdn/example_cumsum.py index 9896c7ecf..0760b4964 100644 --- a/examples/gdn/example_cumsum.py +++ b/examples/gdn/example_cumsum.py @@ -10,6 +10,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.utils.cumsum import chunk_local_cumsum_scalar except ImportError: @@ -20,11 +21,8 @@ @tilelang.jit( - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} +) def tilelang_chunk_local_cumsum_scalar( # task config B, @@ -34,43 +32,43 @@ def tilelang_chunk_local_cumsum_scalar( is_varlen=False, head_first=False, reverse=False, - input_dtype="float16", - output_dtype="float32", + input_dtype=T.float16, + output_dtype=T.float32, # kernel config block_S=64, threads=256, use_fragment=False, ): G_shape = (B, H, S) if head_first else (B, S, H) - assert chunk_size == 2**(chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" assert chunk_size == block_S, "chunk_size must be equal to block_S" @T.prim_func def kernel( - G: T.Tensor(G_shape, dtype=input_dtype), - G_new: T.Tensor(G_shape, dtype=output_dtype), + G: T.Tensor(G_shape, dtype=input_dtype), + G_new: T.Tensor(G_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H G_shared = T.alloc_shared((1, block_S), dtype=output_dtype, scope="shared") if head_first: - T.copy(G[bb, bh, bs * block_S:(bs + 1) * block_S], G_shared) + T.copy(G[bb, bh, bs * block_S : (bs + 1) * block_S], G_shared) else: - T.copy(G[bb, bs * block_S:(bs + 1) * block_S, bh], G_shared) + T.copy(G[bb, bs * block_S : (bs + 1) * block_S, bh], G_shared) if use_fragment: G_fragment = T.alloc_fragment((1, block_S), dtype=output_dtype, scope="shared") T.copy(G_shared, G_fragment) T.cumsum(G_fragment, dim=1, reverse=reverse) if head_first: - T.copy(G_fragment, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + T.copy(G_fragment, G_new[bb, bh, bs * block_S : (bs + 1) * block_S]) else: - T.copy(G_fragment, G_new[bb, bs * block_S:(bs + 1) * block_S, bh]) + T.copy(G_fragment, G_new[bb, bs * block_S : (bs + 1) * block_S, bh]) else: T.cumsum(G_shared, dim=1, reverse=reverse) if head_first: - T.copy(G_shared, G_new[bb, bh, bs * block_S:(bs + 1) * block_S]) + T.copy(G_shared, G_new[bb, bh, bs * block_S : (bs + 1) * block_S]) else: - T.copy(G_shared, G_new[bb, bs * block_S:(bs + 1) * 
block_S, bh]) + T.copy(G_shared, G_new[bb, bs * block_S : (bs + 1) * block_S, bh]) return kernel @@ -113,11 +111,8 @@ def run_test( # reference cumsum G_new_ref = chunk_local_cumsum_scalar( - g=G, - chunk_size=chunk_size, - reverse=reverse, - head_first=head_first, - output_dtype=getattr(torch, output_dtype)) + g=G, chunk_size=chunk_size, reverse=reverse, head_first=head_first, output_dtype=getattr(torch, output_dtype) + ) # tilelang cumsum block_S = chunk_size @@ -159,10 +154,11 @@ def main(): chunk_size=64, reverse=True, head_first=False, - input_dtype="float32", - output_dtype="float32", + input_dtype=T.float32, + output_dtype=T.float32, threads=256, - use_fragment=False) + use_fragment=False, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_wy_fast.py b/examples/gdn/example_wy_fast.py index 0a0983a82..d36dcf9b7 100644 --- a/examples/gdn/example_wy_fast.py +++ b/examples/gdn/example_wy_fast.py @@ -9,6 +9,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.gated_delta_rule.wy_fast import recompute_w_u_fwd except ImportError: @@ -73,13 +74,13 @@ def tilelang_recompute_w_u_fwd( @T.prim_func def kernel( - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=output_dtype), - W: T.Tensor(K_shape, dtype=output_dtype), - U: T.Tensor(V_shape, dtype=output_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=output_dtype), + W: T.Tensor(K_shape, dtype=output_dtype), + U: T.Tensor(V_shape, dtype=output_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -95,49 +96,37 @@ def kernel( W_Beta_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) U_Beta_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) - T.annotate_layout({ - K_shared: tilelang.layout.make_swizzled_layout(K_shared), - V_shared: tilelang.layout.make_swizzled_layout(V_shared), - A_shared: tilelang.layout.make_swizzled_layout(A_shared), - W_shared: tilelang.layout.make_swizzled_layout(W_shared), - U_shared: tilelang.layout.make_swizzled_layout(U_shared), - W_Beta_shared: tilelang.layout.make_swizzled_layout(W_Beta_shared), - U_Beta_shared: tilelang.layout.make_swizzled_layout(U_Beta_shared), - }) + T.annotate_layout( + { + K_shared: tilelang.layout.make_swizzled_layout(K_shared), + V_shared: tilelang.layout.make_swizzled_layout(V_shared), + } + ) T.disable_warp_group_reg_alloc() for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = T.exp(G[bb, bs * block_S + i_s, bh]) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) for i_s, i_v2 in T.Parallel(block_S, block_DV): U_Beta_shared[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] T.gemm(A_shared, U_Beta_shared, U_fragment, clear_accum=True) # First copy to smem, then copy to gmem to reduce U2RU instructions 
T.copy(U_fragment, U_shared) - T.copy( - U_shared, U[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV]) + T.copy(U_shared, U[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV]) for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): - W_Beta_shared[i_s, - i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] + W_Beta_shared[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared[i_s] T.gemm(A_shared, W_Beta_shared, W_fragment, clear_accum=True) # First copy to smem, then copy to gmem to reduce U2RU instructions T.copy(W_fragment, W_shared) - T.copy( - W_shared, W[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(W_shared, W[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) return kernel @@ -159,15 +148,8 @@ def run_test( num_stages, ): K, V, Beta, G, A = prepare_input( - B, - S, - H, - DK, - DV, - chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - gate_dtype=getattr(torch, gate_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, output_dtype), gate_dtype=getattr(torch, gate_dtype) + ) W_ref, U_ref = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) W_tilelang, U_tilelang = prepare_output(B, S, H, DK, DV, getattr(torch, output_dtype)) @@ -191,7 +173,8 @@ def run_test( block_DK=block_DK, block_DV=block_DV, threads=threads, - num_stages=num_stages) + num_stages=num_stages, + ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) @@ -217,14 +200,15 @@ def main(): DK=128, DV=128, chunk_size=64, - input_dtype="bfloat16", - output_dtype="bfloat16", - gate_dtype="float32", - accum_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + gate_dtype=T.float32, + accum_dtype=T.float32, block_DK=64, block_DV=32, threads=128, - num_stages=3) + num_stages=3, + ) if __name__ == "__main__": diff --git a/examples/gdn/example_wy_fast_bwd_split.py b/examples/gdn/example_wy_fast_bwd_split.py index 618a82b4c..de8afc2b7 100644 --- a/examples/gdn/example_wy_fast_bwd_split.py +++ b/examples/gdn/example_wy_fast_bwd_split.py @@ -10,6 +10,7 @@ # sys.path.insert(0, "/home/tzj/flash-linear-attention") try: import fla + print(fla.__file__) from fla.ops.gated_delta_rule.wy_fast import bwd_prepare_wy_repr except ImportError: @@ -93,10 +94,8 @@ def prepare_output( @tilelang.jit( out_idx=[-5, -4, -3, -2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) + pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) def tilelang_wy_fast_bwd( # task config B, @@ -135,20 +134,20 @@ def tilelang_wy_fast_bwd( @T.prim_func def kernel( - # input - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=input_dtype), - dw: T.Tensor(dw_shape, dtype=input_dtype), - du: T.Tensor(du_shape, dtype=input_dtype), - # output - dA: T.Tensor(dA_shape, dtype=input_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), 
- dv: T.Tensor(dv_shape, dtype=output_dtype), - dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), - dg: T.Tensor(dg_shape, dtype=gate_dtype), + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + # output + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta: T.Tensor(dbeta_shape, dtype=output_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -187,7 +186,7 @@ def kernel( T.clear(dbeta_fragment_v) T.clear(dg_fragment) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = G[bb, bs * block_S + i_s, bh] @@ -195,51 +194,37 @@ def kernel( # Update dk for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) for i_s, i_k2 in T.Parallel(block_S, block_DK): - K_shared_beta_g[i_s, - i_k2] = K_shared[i_s, - i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] - T.copy( - dw[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK], dw_shared) + K_shared_beta_g[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + T.copy(dw[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], dw_shared) T.gemm(dw_shared, K_shared_beta_g, dA_fragment, transpose_B=True) T.gemm(A_shared, dw_shared, dk_fragment_beta_g, clear_accum=True, transpose_A=True) for i_s, i_k2 in T.Parallel(block_S, block_DK): - dk_fragment[ - i_s, - i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] + dk_fragment[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * Beta_shared[i_s] * G_shared_exp[i_s] # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[ - i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dg_fragment[i_s] = dg_fragment[i_s] + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dg_fragment_reduce_tmp[i_s, i_k2] = dk_fragment_beta_g[i_s, i_k2] * K_shared[ - i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + dg_fragment_reduce_tmp[i_s, i_k2] = ( + dk_fragment_beta_g[i_s, i_k2] * K_shared[i_s, i_k2] * G_shared_exp[i_s] * Beta_shared[i_s] + ) T.reduce_sum(dg_fragment_reduce_tmp, dg_fragment, dim=1, clear=False) # correct dk - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + 
T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) # Update dv for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): - T.copy( - V[bb, bs * block_S:(bs + 1) * block_S, bh, i_v * block_DV:(i_v + 1) * block_DV], - V_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) for i_s, i_v2 in T.Parallel(block_S, block_DV): V_shared_beta[i_s, i_v2] = V_shared[i_s, i_v2] * Beta_shared[i_s] - T.copy( - du[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV], du_shared) + T.copy(du[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], du_shared) T.gemm(du_shared, V_shared_beta, dA_fragment, transpose_B=True) T.gemm(A_shared, du_shared, dv_fragment_beta, clear_accum=True, transpose_A=True) for i_s, i_v2 in T.Parallel(block_S, block_DV): @@ -247,30 +232,22 @@ def kernel( # for i_s, i_v2 in T.Parallel(block_S, block_DV): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] for i_s, i_v2 in T.Parallel(block_S, block_DV): - dbeta_fragment_reduce_tmpv[i_s, - i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, - i_v2] + dbeta_fragment_reduce_tmpv[i_s, i_v2] = dv_fragment_beta[i_s, i_v2] * V_shared[i_s, i_v2] T.reduce_sum(dbeta_fragment_reduce_tmpv, dbeta_fragment_v, dim=1, clear=False) - T.copy( - dv_fragment, dv[bb, bs * block_S:(bs + 1) * block_S, bh, - i_v * block_DV:(i_v + 1) * block_DV]) + T.copy(dv_fragment, dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV]) # Temporary store dbeta, dg and dA for i_s in T.Parallel(block_S): dbeta[bb, bs * block_S + i_s, bh] = dbeta_fragment_k[i_s] + dbeta_fragment_v[i_s] dg[bb, bs * block_S + i_s, bh] = dg_fragment[i_s] # correct dA - T.copy(dA_fragment, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) + T.copy(dA_fragment, dA[bb, bs * block_S : (bs + 1) * block_S, bh, :]) return kernel -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True - }) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_wy_fast_bwd_split( # task config B, @@ -308,20 +285,20 @@ def tilelang_wy_fast_bwd_split( @T.prim_func def kernel( - # input - K: T.Tensor(K_shape, dtype=input_dtype), - V: T.Tensor(V_shape, dtype=input_dtype), - Beta: T.Tensor(Beta_shape, dtype=input_dtype), - G: T.Tensor(G_shape, dtype=gate_dtype), - A: T.Tensor(A_shape, dtype=input_dtype), - dw: T.Tensor(dw_shape, dtype=input_dtype), - du: T.Tensor(du_shape, dtype=input_dtype), - dA: T.Tensor(dA_shape, dtype=input_dtype), - dk: T.Tensor(dk_shape, dtype=output_dtype), - dv: T.Tensor(dv_shape, dtype=output_dtype), - dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), - dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), - dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), + # input + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + Beta: T.Tensor(Beta_shape, dtype=input_dtype), + G: T.Tensor(G_shape, dtype=gate_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dw: T.Tensor(dw_shape, dtype=input_dtype), + du: T.Tensor(du_shape, dtype=input_dtype), + dA: T.Tensor(dA_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=output_dtype), + dv: T.Tensor(dv_shape, dtype=output_dtype), + dbeta_k: T.Tensor(dbeta_shape, dtype=output_dtype), + 
dg_A_positive: T.Tensor(dA_shape, dtype=gate_dtype), + dg_A_negative: T.Tensor(dA_shape, dtype=gate_dtype), ): with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): bb, bh = bbh // H, bbh % H @@ -350,7 +327,7 @@ def kernel( T.clear(dA_A_fragment_1) T.clear(dA_A_fragment_2) - T.copy(A[bb, bs * block_S:(bs + 1) * block_S, bh, :], A_shared) + T.copy(A[bb, bs * block_S : (bs + 1) * block_S, bh, :], A_shared) for i_s in T.Parallel(block_S): Beta_shared[i_s] = Beta[bb, bs * block_S + i_s, bh] G_shared[i_s] = G[bb, bs * block_S + i_s, bh] @@ -361,7 +338,7 @@ def kernel( # for i_s in T.Parallel(block_S): # dbeta_fragment[i_s] = dbeta[bb, bs * block_S + i_s, bh] # dg_fragment[i_s] = dg[bb, bs * block_S + i_s, bh] - T.copy(dA[bb, bs * block_S:(bs + 1) * block_S, bh, :], dA_shared) + T.copy(dA[bb, bs * block_S : (bs + 1) * block_S, bh, :], dA_shared) # T.copy(dA_shared, dA[bb, bs * block_S:(bs + 1) * block_S, bh, :]) # Update dA @@ -385,8 +362,7 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): with T.If(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): with T.Then(): - dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - - G[bb, bs * block_S + i_s2, bh]) + dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]) with T.Else(): dA_fragment[i_s1, i_s2] = 0 T.copy(dA_fragment, dA_shared) @@ -397,12 +373,8 @@ def kernel( # Update dk using previous dk T.clear(A_fragment) for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages): - T.copy( - K[bb, bs * block_S:(bs + 1) * block_S, bh, i_k * block_DK:(i_k + 1) * block_DK], - K_shared) - T.copy( - dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK], dk_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], K_shared) + T.copy(dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK], dk_shared) T.copy(dk_shared, dk_fragment) for i_s, i_k2 in T.Parallel(block_S, block_DK): K_shared_beta[i_s, i_k2] = K_shared[i_s, i_k2] * Beta_shared[i_s] @@ -411,18 +383,14 @@ def kernel( # for i_s, i_k2 in T.Parallel(block_S, block_DK): # dbeta_fragment[i_s] = dbeta_fragment[i_s] + dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] for i_s, i_k2 in T.Parallel(block_S, block_DK): - dbeta_fragment_reduce_tmpk[i_s, - i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, - i_k2] + dbeta_fragment_reduce_tmpk[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * K_shared[i_s, i_k2] T.reduce_sum(dbeta_fragment_reduce_tmpk, dbeta_fragment_k, dim=1, clear=False) T.gemm(dA_shared, K_shared_beta, dk_fragment, transpose_A=True) for i_s, i_k2 in T.Parallel(block_S, block_DK): dk_shared_beta[i_s, i_k2] = dk_fragment_beta[i_s, i_k2] * Beta_shared[i_s] for i_s, i_k2 in T.Parallel(block_S, block_DK): dk_fragment[i_s, i_k2] = dk_fragment[i_s, i_k2] + dk_shared_beta[i_s, i_k2] - T.copy( - dk_fragment, dk[bb, bs * block_S:(bs + 1) * block_S, bh, - i_k * block_DK:(i_k + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, i_k * block_DK : (i_k + 1) * block_DK]) # Update dg and dbeta T.copy(A_fragment, A_shared) @@ -460,19 +428,25 @@ def run_test( threads=128, num_stages=0, ): - K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, - accum_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output(B, S, 
H, DK, DV, chunk_size, - getattr(torch, output_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + K, V, Beta, G, A, dw, du = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + dk_ref, dv_ref, dbeta_ref, dg_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) BS = chunk_size dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() @@ -480,28 +454,55 @@ def run_test( dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() # ref - dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr( - K, V, G, Beta, A, dw, du, cu_seqlens=None) + dk_ref, dv_ref, dbeta_ref, dg_ref = bwd_prepare_wy_repr(K, V, G, Beta, A, dw, du, cu_seqlens=None) # tilelang - kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, - num_stages) - dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( - K, V, Beta, G, A, dw, du) + kernel = tilelang_wy_fast_bwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel(K, V, Beta, G, A, dw, du) torch.cuda.synchronize() - kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - block_DK, block_DV, threads, num_stages) - kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, - dg_tilelang_A_positive, dg_tilelang_A_negative) + kernel_split = tilelang_wy_fast_bwd_split( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + kernel_split( + K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, dg_tilelang_A_positive, dg_tilelang_A_negative + ) torch.cuda.synchronize() dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang - dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( - dim=-1) + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(dim=-1) + + from test_utils import assert_similar - from utils import assert_similar assert_similar(dk_ref, dk_tilelang, eps=1e-5, name="dk", raise_assert=False) assert_similar(dv_ref, dv_tilelang, eps=1e-5, name="dv", raise_assert=False) assert_similar(dbeta_ref, dbeta_tilelang, eps=1e-5, name="dbeta", raise_assert=False) @@ -517,11 +518,11 @@ def main(): H=8, DK=DK, DV=DV, - input_dtype="bfloat16", - output_dtype="bfloat16", - accum_dtype="float32", - gate_dtype="float32", - state_dtype="float32", + input_dtype=T.bfloat16, + output_dtype=T.bfloat16, + accum_dtype=T.float32, + 
gate_dtype=T.float32, + state_dtype=T.float32, chunk_size=64, block_DK=32, block_DV=32, diff --git a/examples/gdn/test_example_gdn_compilation.py b/examples/gdn/test_example_gdn_compilation.py index e184dbcac..6f9fa5d2f 100644 --- a/examples/gdn/test_example_gdn_compilation.py +++ b/examples/gdn/test_example_gdn_compilation.py @@ -1,16 +1,16 @@ -import tilelang.testing import torch +from tilelang import language as T B = 1 S = 1024 # small but for test only. H = 32 DK = 128 DV = 128 -input_dtype = "bfloat16" -output_dtype = "bfloat16" -accum_dtype = "float32" -gate_dtype = "float32" -state_dtype = "float32" +input_dtype = T.bfloat16 +output_dtype = T.bfloat16 +accum_dtype = T.float32 +gate_dtype = T.float32 +state_dtype = T.float32 chunk_size = 64 use_g = True use_initial_state = True @@ -20,21 +20,15 @@ block_DK = 64 block_DV = 32 threads = 128 -num_stages = 1 +num_stages = 0 def test_example_wy_fast_compilation(): from example_wy_fast import tilelang_recompute_w_u_fwd, prepare_input + K, V, Beta, G, A = prepare_input( - B, - S, - H, - DK, - DV, - chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - gate_dtype=getattr(torch, gate_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, output_dtype), gate_dtype=getattr(torch, gate_dtype) + ) # tilelang block_S = chunk_size kernel = tilelang_recompute_w_u_fwd( @@ -52,22 +46,31 @@ def test_example_wy_fast_compilation(): block_DK=block_DK, block_DV=block_DV, threads=threads, - num_stages=num_stages) + num_stages=num_stages, + ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) def test_example_wy_fast_bwd_split_compilation(): from example_wy_fast_bwd_split import tilelang_wy_fast_bwd, tilelang_wy_fast_bwd_split, prepare_input, prepare_output - K, V, Beta, G, A, dw, du = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, - accum_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + + K, V, Beta, G, A, dw, du = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = prepare_output( - B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), - getattr(torch, state_dtype)) + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) BS = chunk_size dA_tilelang = torch.empty(B, S, H, BS, dtype=getattr(torch, input_dtype)).cuda() dbeta_tilelang_k = torch.empty(B, S, H, dtype=getattr(torch, output_dtype)).cuda() @@ -75,67 +78,146 @@ def test_example_wy_fast_bwd_split_compilation(): dg_tilelang_A_negative = torch.empty(B, S, H, BS, dtype=getattr(torch, gate_dtype)).cuda() # tilelang - kernel = tilelang_wy_fast_bwd(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, block_DK, block_DV, threads, - num_stages) - dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel( - K, V, Beta, G, A, dw, du) + kernel = tilelang_wy_fast_bwd( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang, dg_tilelang = kernel(K, V, Beta, G, A, dw, du) torch.cuda.synchronize() - 
kernel_split = tilelang_wy_fast_bwd_split(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - block_DK, block_DV, threads, num_stages) - kernel_split(K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, - dg_tilelang_A_positive, dg_tilelang_A_negative) + kernel_split = tilelang_wy_fast_bwd_split( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + block_DK, + block_DV, + threads, + num_stages, + ) + kernel_split( + K, V, Beta, G, A, dw, du, dA_tilelang, dk_tilelang, dv_tilelang, dbeta_tilelang_k, dg_tilelang_A_positive, dg_tilelang_A_negative + ) torch.cuda.synchronize() dbeta_tilelang = dbeta_tilelang_k + dbeta_tilelang - dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum( - dim=-1) + dg_tilelang = dg_tilelang + dg_tilelang_A_positive.sum(dim=-1) - dg_tilelang_A_negative.sum(dim=-1) def test_example_chunk_o_compilation(): from example_chunk_o import tilelang_chunk_fwd_o, prepare_input - Q, K, V, HIDDEN, G = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) + + Q, K, V, HIDDEN, G = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) scale = 1.0 / DK**0.5 block_S = chunk_size - kernel = tilelang_chunk_fwd_o(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, chunk_size, scale, use_g, block_S, block_DK, block_DV, - threads, num_stages) + kernel = tilelang_chunk_fwd_o( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + scale, + use_g, + block_S, + block_DK, + block_DV, + threads, + num_stages, + ) O_tilelang = kernel(Q, K, V, HIDDEN, G) # noqa: F841 def test_example_chunk_o_bwd_compilation(): from example_chunk_o_bwd import tilelang_chunk_o_bwd_dqkwg, prepare_input - Q, K, V, h, G, dO, dh, dv, W = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - kernel = tilelang_chunk_o_bwd_dqkwg(B, S, H, DK, DV, input_dtype, output_dtype, accum_dtype, - gate_dtype, state_dtype, chunk_size, 1.0, use_g, True, - block_DK, block_DV, threads, num_stages) - dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, - W) # noqa: F841 + + Q, K, V, h, G, dO, dh, dv, W = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + kernel = tilelang_chunk_o_bwd_dqkwg( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + 1.0, + use_g, + True, + block_DK, + block_DV, + threads, + num_stages, + ) + + dq_tilelang, dk_tilelang, dw_tilelang, dg_tilelang = kernel(Q, K, V, h, G, dO, dh, dv, W) # noqa: F841 if use_g: dg_tilelang = dg_tilelang.sum(dim=0) def test_example_chunk_scaled_dot_kkt_compilation(): from example_chunk_scaled_dot_kkt import tilelang_chunk_scaled_dot_kkt_fwd, prepare_input - K, Beta, G = prepare_input(B, S, H, DK, getattr(torch, input_dtype), - getattr(torch, output_dtype), getattr(torch, accum_dtype)) + + K, Beta, G = 
prepare_input(B, S, H, DK, getattr(torch, input_dtype), getattr(torch, output_dtype), getattr(torch, accum_dtype)) block_S = chunk_size - kernel = tilelang_chunk_scaled_dot_kkt_fwd(B, S, H, DK, chunk_size, input_dtype, output_dtype, - accum_dtype, use_g, block_S, block_DK, threads, - num_stages) + kernel = tilelang_chunk_scaled_dot_kkt_fwd( + B, S, H, DK, chunk_size, input_dtype, output_dtype, accum_dtype, use_g, block_S, block_DK, threads, num_stages + ) A_tilelang = kernel(K, Beta, G) # noqa: F841 def test_example_cumsum_compilation(): from example_cumsum import tilelang_chunk_local_cumsum_scalar, prepare_cumsum_input, prepare_cumsum_output + G = prepare_cumsum_input(B, S, H, getattr(torch, gate_dtype)) G_new_tilelang = prepare_cumsum_output(B, S, H, getattr(torch, gate_dtype)) block_S = chunk_size @@ -157,35 +239,82 @@ def test_example_cumsum_compilation(): def test_example_chunk_delta_h_compilation(): from example_chunk_delta_h import tilelang_chunk_gated_delta_rule_fwd_h, prepare_input - K, W, U, G, initial_state = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype)) - kernel = tilelang_chunk_gated_delta_rule_fwd_h(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, chunk_size, - use_g, use_initial_state, store_final_state, - save_new_value, block_DK, block_DV, threads, - num_stages) - h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, - initial_state) # noqa: F841 + + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_g, + use_initial_state, + store_final_state, + save_new_value, + block_DK, + block_DV, + threads, + num_stages, + ) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) # noqa: F841 def test_example_chunk_delta_bwd_compilation(): from example_chunk_delta_bwd import tilelang_chunk_gated_delta_rule_bwd_dhu, prepare_input - Q, K, W, G, h0, dht, dO, dv = prepare_input(B, S, H, DK, DV, chunk_size, - getattr(torch, input_dtype), - getattr(torch, output_dtype), - getattr(torch, accum_dtype), - getattr(torch, gate_dtype), - getattr(torch, state_dtype)) - kernel = tilelang_chunk_gated_delta_rule_bwd_dhu(B, S, H, DK, DV, input_dtype, output_dtype, - accum_dtype, gate_dtype, state_dtype, - chunk_size, 1.0, use_g, use_initial_state, - use_final_state_gradient, block_DV, threads, - num_stages) + + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + 1.0, + use_g, + use_initial_state, + use_final_state_gradient, + block_DV, + threads, + num_stages, + ) dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) # noqa: F841 if __name__ == "__main__": - tilelang.testing.main() + # tilelang.testing.main() + test_example_chunk_delta_bwd_compilation() diff --git a/examples/gdn/test_utils.py 
b/examples/gdn/test_utils.py
new file mode 100644
index 000000000..3588551ce
--- /dev/null
+++ b/examples/gdn/test_utils.py
@@ -0,0 +1,38 @@
+import torch
+
+
+def print_red_warning(message):
+    print(f"\033[31mWARNING: {message}\033[0m")
+
+
+def calc_sim(x, y, name="tensor"):
+    x, y = x.data.double(), y.data.double()
+    denominator = (x * x + y * y).sum()
+    if denominator == 0:
+        print_red_warning(f"{name} all zero")
+        return 1
+    sim = 2 * (x * y).sum() / denominator
+    return sim
+
+
+def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True):
+    x_mask = torch.isfinite(x)
+    y_mask = torch.isfinite(y)
+    if not torch.all(x_mask == y_mask):
+        print_red_warning(f"{name} Error: isfinite mask mismatch")
+        if raise_assert:
+            raise AssertionError
+    if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all():
+        print_red_warning(f"{name} Error: nonfinite value mismatch")
+        if raise_assert:
+            raise AssertionError
+    x = x.masked_fill(~x_mask, 0)
+    y = y.masked_fill(~y_mask, 0)
+    sim = calc_sim(x, y, name)
+    diff = 1.0 - sim
+    if not (0 <= diff <= eps):
+        print_red_warning(f"{name} Error: {diff}")
+        if raise_assert:
+            raise AssertionError
+    else:
+        print(f"{name} {data} passed")
diff --git a/examples/gdn/utils.py b/examples/gdn/utils.py
index 37f8d8e69..3588551ce 100644
--- a/examples/gdn/utils.py
+++ b/examples/gdn/utils.py
@@ -9,7 +9,7 @@ def calc_sim(x, y, name="tensor"):
     x, y = x.data.double(), y.data.double()
     denominator = (x * x + y * y).sum()
     if denominator == 0:
-        print_red_warning(f'{name} all zero')
+        print_red_warning(f"{name} all zero")
         return 1
     sim = 2 * (x * y).sum() / denominator
     return sim
@@ -19,21 +19,19 @@ def assert_similar(x, y, eps=1e-8, name="tensor", data="", raise_assert=True):
     x_mask = torch.isfinite(x)
     y_mask = torch.isfinite(y)
     if not torch.all(x_mask == y_mask):
-        print_red_warning(f'{name} Error: isfinite mask mismatch')
+        print_red_warning(f"{name} Error: isfinite mask mismatch")
         if raise_assert:
             raise AssertionError
-    if not torch.isclose(
-            x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0,
-            equal_nan=True).all():
-        print_red_warning(f'{name} Error: nonfinite value mismatch')
+    if not torch.isclose(x.masked_fill(x_mask, 0), y.masked_fill(y_mask, 0), rtol=0, atol=0, equal_nan=True).all():
+        print_red_warning(f"{name} Error: nonfinite value mismatch")
         if raise_assert:
             raise AssertionError
     x = x.masked_fill(~x_mask, 0)
     y = y.masked_fill(~y_mask, 0)
     sim = calc_sim(x, y, name)
-    diff = 1. - sim
+    diff = 1.0 - sim
     if not (0 <= diff <= eps):
-        print_red_warning(f'{name} Error: {diff}')
+        print_red_warning(f"{name} Error: {diff}")
         if raise_assert:
             raise AssertionError
     else:
diff --git a/examples/gemm/README.md b/examples/gemm/README.md
index 059d08c84..9ab7fb661 100644
--- a/examples/gemm/README.md
+++ b/examples/gemm/README.md
@@ -4,20 +4,23 @@ TileLang is a domain-specific language designed to simplify the process of writi
 
 ## Table of Contents
 
-1. [Getting Started](#getting-started)
-2. [Simple GEMM Example](#simple-gemm-example)
-   - [Code Walkthrough](#code-walkthrough)
-   - [Compiling and Profiling](#compiling-and-profiling)
-3. [Advanced GEMM Features](#advanced-gemm-features)
-   - [Custom Memory Layout / Swizzling](#custom-memory-layout--swizzling)
-   - [Parallel Copy and Auto-Pipelining](#parallel-copy-and-auto-pipelining)
-   - [Rasterization for L2 Cache Locality](#rasterization-for-l2-cache-locality)
-4. 
[Enhanced GEMM Example with Annotations](#enhanced-gemm-example-with-annotations) -5. [Verifying Correctness](#verifying-correctness) -6. [Fine-grained MMA Computations](#fine-grained-mma-computations) - - [Example Workflow](#example-workflow) - - [Summary](#summary) -7. [References](#references) +- [Table of Contents](#table-of-contents) +- [Getting Started](#getting-started) + - [Prerequisites](#prerequisites) + - [Installation](#installation) +- [Simple GEMM Example](#simple-gemm-example) + - [Code Walkthrough](#code-walkthrough) + - [Compiling and Profiling](#compiling-and-profiling) +- [Advanced GEMM Features](#advanced-gemm-features) + - [Custom Memory Layout / Swizzling](#custom-memory-layout--swizzling) + - [Parallel Copy and Auto-Pipelining](#parallel-copy-and-auto-pipelining) + - [Rasterization for L2 Cache Locality](#rasterization-for-l2-cache-locality) +- [Enhanced GEMM Example with Annotations](#enhanced-gemm-example-with-annotations) +- [Verifying Correctness](#verifying-correctness) +- [Fine-grained MMA Computations](#fine-grained-mma-computations) + - [Example Workflow](#example-workflow) + - [Summary](#summary) +- [References](#references) --- @@ -25,10 +28,10 @@ TileLang is a domain-specific language designed to simplify the process of writi ### Prerequisites -- **Python 3.8+** -- **NVIDIA GPU** with a recent CUDA toolkit installed +- **Python 3.8+** +- **NVIDIA GPU** with a recent CUDA toolkit installed - **PyTorch** (optional, for easy correctness verification) -- **tilelang** +- **tilelang** - **bitblas** (optional; used for swizzle layout utilities in the advanced examples) ### Installation @@ -50,7 +53,7 @@ import tilelang from tilelang import Profiler import tilelang.language as T -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def main( A: T.Tensor((M, K), dtype), @@ -87,26 +90,26 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo ### Code Walkthrough -1. **Define the Kernel Launch Configuration:** +1. **Define the Kernel Launch Configuration:** ```python with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): ``` This creates a grid of blocks (ceildiv(N, block_N) in x-dimension, ceildiv(M, block_M) in y-dimension), each with 128 threads. -2. **Shared Memory Allocation:** +2. **Shared Memory Allocation:** ```python A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) ``` Tiles of \(A\) and \(B\) are loaded into these shared memory buffers for faster access. -3. **Local Fragment Accumulation:** +3. **Local Fragment Accumulation:** ```python C_local = T.alloc_fragment((block_M, block_N), accum_dtype) ``` Partial results are stored in registers (or local memory) to reduce writes to global memory. -4. **Pipelined Loading and GEMM:** +4. **Pipelined Loading and GEMM:** ```python for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): T.copy(...) @@ -114,7 +117,7 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo ``` Loads blocks of \(A\) and \(B\) in a pipelined fashion (up to 3 stages). This exploits overlap of data transfer and computation. -5. **Copy Out the Results:** +5. **Copy Out the Results:** ```python T.copy(C_local, C[by * block_M, bx * block_N]) ``` @@ -173,7 +176,7 @@ import tilelang.language as T # that helps align data for MMA (Matrix Multiply-Accumulate) operations. 
from tilelang.intrinsics import make_mma_swizzle_layout as make_swizzle_layout -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float): @T.prim_func def main( A: T.Tensor((M, K), dtype), @@ -216,10 +219,10 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo return main ``` -**Key Differences vs. Basic Example** -1. **`T.annotate_layout(...)`**: Annotates how data should be organized in shared memory (swizzling). -2. **`T.use_swizzle(...)`**: Enables swizzle-based rasterization. -3. **Parallel Copy Loop** with `T.Parallel(...)`: Distributes global-to-shared copy across all threads, potentially vectorizing load/store instructions. +**Key Differences vs. Basic Example** +1. **`T.annotate_layout(...)`**: Annotates how data should be organized in shared memory (swizzling). +2. **`T.use_swizzle(...)`**: Enables swizzle-based rasterization. +3. **Parallel Copy Loop** with `T.Parallel(...)`: Distributes global-to-shared copy across all threads, potentially vectorizing load/store instructions. --- @@ -247,7 +250,7 @@ print("Results match!") ## Fine-grained MMA Computations -For advanced users who require full control over warp-level matrix multiplication operations, TileLang allows you to specify fine-grained MMA (Matrix Multiply-Accumulate) computations in a manner similar to writing raw CUDA. While higher-level abstractions like `T.gemm(...)` or automatic MMA emitters are sufficient for many use cases, specialized workloads (for example, dequantize gemm may require fine-grained layout transformation on shared to register stage) may benefit from explicitly controlling each MMA instruction, the data layout, and the synchronization points. +For advanced users who require full control over warp-level matrix multiplication operations, TileLang allows you to specify fine-grained MMA (Matrix Multiply-Accumulate) computations in a manner similar to writing raw CUDA. While higher-level abstractions like `T.gemm(...)` or automatic MMA emitters are sufficient for many use cases, specialized workloads (for example, dequantize gemm may require fine-grained layout transformation on shared to register stage) may benefit from explicitly controlling each MMA instruction, the data layout, and the synchronization points. ### Example Workflow @@ -262,18 +265,18 @@ def tl_matmul( accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -394,10 +397,10 @@ def tl_matmul( ] ``` -1. **Set Up Tile Sizes and Thread Bindings** +1. **Set Up Tile Sizes and Thread Bindings** Just like in CUDA, you will typically start by defining how many warps or threads per block you want and how your matrix is subdivided. In TileLang, this is done via `T.Kernel(...)` and `T.thread_binding(...),` which ensure that the correct number of threads are active, and each thread is bound to a specific role (e.g., warp ID or lane ID). -2. **Allocate Warp-local Fragments** +2. 
**Allocate Warp-local Fragments** Instead of using a single shared buffer for partial sums, you allocate local buffers (register fragments) to hold sub-blocks of matrices \(A\) and \(B\). In TileLang, this is done with something like: ```python A_local = T.alloc_local((warp_rows * local_size_a), in_dtype) @@ -406,7 +409,7 @@ def tl_matmul( ``` Each of these `local` allocations represents a region of per-thread storage, which collectively forms the warp’s register tiles. -3. **Load Data via `ldmatrix`** +3. **Load Data via `ldmatrix`** Fine-grained loading instructions allow you to specify exactly how data moves from shared memory to the warp-level fragments. In the example below, `mma_emitter.ldmatrix_a()` and `.ldmatrix_b()` are higher-level wrappers around warp-synchronous intrinsics. You can write your own load logic as well: ```python for ki in T.serial(0, (block_K // micro_size_k)): @@ -418,7 +421,7 @@ def tl_matmul( ``` Internally, these calls orchestrate how each thread in the warp issues the correct load instructions, performs address calculations, and stores the data into registers. -4. **Perform the MMA Instruction** +4. **Perform the MMA Instruction** After loading sub-tiles (fragments), the warp executes the `mma` instruction. This operation is essentially: \[ C_{\text{local}} \;+=\; A_{\text{local}} \;\times\; B_{\text{local}} @@ -429,7 +432,7 @@ def tl_matmul( ``` Under the hood, this translates into Tensor Core instructions (e.g., `wmma.mma.sync` in PTX), which process multiple data elements per warp in parallel. -5. **Store Results via `stmatrix`** +5. **Store Results via `stmatrix`** Finally, you write the results from the warp-level fragments back to shared memory or global memory. This step might happen multiple times in a loop or just once at the end. The code snippet: ```python mma_emitter.stmatrix(C_local, C_shared) @@ -444,6 +447,6 @@ By combining warp-synchronous intrinsics (`ldmatrix`, `mma`, `stmatrix`) with ma ## References -- [NVIDIA CUTLASS Library](https://github.com/NVIDIA/cutlass): A collection of high-performance CUDA C++ template abstractions for GEMM. -- [NVIDIA CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html): Official documentation for CUDA. +- [NVIDIA CUTLASS Library](https://github.com/NVIDIA/cutlass): A collection of high-performance CUDA C++ template abstractions for GEMM. +- [NVIDIA CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html): Official documentation for CUDA. - [PyTorch Documentation](https://pytorch.org/docs): For verifying correctness via CPU or GPU-based matmul. 
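A minimal end-to-end sketch of the verification flow described in the README changes above, written against the `@tilelang.jit(out_idx=[-1])` `matmul` factory defined in `examples/gemm/example_gemm.py` (next hunk). The helper name `check_gemm`, the tile sizes, and the tolerances are illustrative assumptions rather than part of this patch; it presumes a CUDA-capable GPU and is intended to be run from `examples/gemm/`.

```python
# Sketch only: check the basic GEMM kernel against a PyTorch reference, mirroring
# the profiler.assert_allclose(..., atol=1e-2, rtol=1e-2) checks used elsewhere
# in these examples. Assumes it runs from examples/gemm/ with a CUDA GPU available.
import torch

from example_gemm import matmul  # @tilelang.jit(out_idx=[-1]) factory from example_gemm.py


def check_gemm(M: int = 1024, N: int = 1024, K: int = 1024) -> None:
    # Illustrative tile sizes; any configuration accepted by `matmul` works.
    kernel = matmul(M, N, K, block_M=128, block_N=128, block_K=32)
    a = torch.randn(M, K, dtype=torch.float16, device="cuda")
    b = torch.randn(K, N, dtype=torch.float16, device="cuda")
    c = kernel(a, b)  # out_idx=[-1] means C is allocated and returned by the kernel
    ref = (a.float() @ b.float()).half()
    torch.testing.assert_close(c, ref, rtol=1e-2, atol=1e-2)
    print("Results match!")


if __name__ == "__main__":
    check_gemm()
```

The reference product is accumulated in float32 and cast back to float16, matching the kernel's `accum_dtype`, so the 1e-2 tolerances used by the example profilers are appropriate here as well.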
diff --git a/examples/gemm/example_gemm.py b/examples/gemm/example_gemm.py index f18cd388a..dfa431121 100644 --- a/examples/gemm/example_gemm.py +++ b/examples/gemm/example_gemm.py @@ -3,13 +3,12 @@ @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -58,5 +57,11 @@ def main(): print(f"tilelang Latency: {latency}ms") +def run_regression_perf(): + kernel = matmul(1024, 1024, 1024, 128, 128, 32) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm/example_gemm_autotune.py b/examples/gemm/example_gemm_autotune.py index 661ef1276..016d448a4 100644 --- a/examples/gemm/example_gemm_autotune.py +++ b/examples/gemm/example_gemm_autotune.py @@ -51,9 +51,9 @@ def get_configs(M, N, K, with_roller=False, topk=20): M=M, N=N, K=K, - in_dtype="float16", - out_dtype="float16", - accum_dtype="float", + in_dtype=T.float16, + out_dtype=T.float16, + accum_dtype=T.float32, ).with_arch(arch) func = carve_template.equivalent_function() @@ -90,7 +90,8 @@ def get_configs(M, N, K, with_roller=False, topk=20): num_stages, thread_num, enable_rasterization, - )) + ) + ) configs = [ { @@ -100,13 +101,13 @@ def get_configs(M, N, K, with_roller=False, topk=20): "num_stages": c[3], "thread_num": c[4], "enable_rasteration": c[5], # keep param name for backward-compat - } for c in _configs + } + for c in _configs ] return configs def get_best_config(M, N, K, with_roller=False): - def kernel( block_M=None, block_N=None, @@ -115,17 +116,16 @@ def kernel( thread_num=None, enable_rasteration=None, ): - dtype = "bfloat16" - accum_dtype = "float" + dtype = T.bfloat16 + accum_dtype = T.float32 @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -146,15 +146,18 @@ def main( return main - autotuner = AutoTuner.from_kernel( - kernel=kernel, configs=get_configs(M, N, K, with_roller)).set_compile_args( + autotuner = ( + AutoTuner.from_kernel(kernel=kernel, configs=get_configs(M, N, K, with_roller)) + .set_compile_args( out_idx=[-1], target="auto", - ).set_profile_args( + ) + .set_profile_args( supply_type=tl.TensorSupplyType.Integer, ref_prog=ref_program, skip_check=False, ) + ) return autotuner.run(warmup=3, rep=20) @@ -167,52 +170,20 @@ def get_heuristic_config() -> dict: sm_version = sm_major * 10 + sm_minor print(f"CUDA device capability: {sm_version}") if sm_version in {80}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 2, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, 
"block_N": 256, "block_K": 32, "num_stages": 2, "thread_num": 128, "enable_rasteration": True} elif sm_version in {90}: - return { - "block_M": 128, - "block_N": 256, - "block_K": 64, - "num_stages": 3, - "thread_num": 256, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 64, "num_stages": 3, "thread_num": 256, "enable_rasteration": True} else: - return { - "block_M": 128, - "block_N": 256, - "block_K": 32, - "num_stages": 0, - "thread_num": 128, - "enable_rasteration": True - } + return {"block_M": 128, "block_N": 256, "block_K": 32, "num_stages": 0, "thread_num": 128, "enable_rasteration": True} @tl.jit(out_idx=[-1]) -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - num_stages, - thread_num, - enable_rasteration, - dtype="float16", - accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, num_stages, thread_num, enable_rasteration, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm_autotune( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -236,11 +207,7 @@ def gemm_autotune( return gemm_autotune -def main(M: int = 4096, - N: int = 4096, - K: int = 4096, - use_autotune: bool = False, - with_roller: bool = False): +def main(M: int = 4096, N: int = 4096, K: int = 4096, use_autotune: bool = False, with_roller: bool = False): use_autotune = True if use_autotune: result = get_best_config(M, N, K, with_roller) @@ -261,20 +228,19 @@ def main(M: int = 4096, print(f"Ref TFlops: {2 * M * N * K / ref_latency * 1e-9}") +def run_regression_perf(M: int = 4096, N: int = 4096, K: int = 4096): + config = get_heuristic_config() + kernel = matmul(M, N, K, **config) + profiler = kernel.get_profiler(tensor_supply_type=tl.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") parser.add_argument("--m", type=int, default=4096, help="Matrix dimension M") parser.add_argument("--n", type=int, default=4096, help="Matrix dimension N") parser.add_argument("--k", type=int, default=4096, help="Matrix dimension K") - parser.add_argument( - "--use_autotune", - action="store_true", - default=False, - help="Whether to use autotune for matmul configs") - parser.add_argument( - "--with_roller", - action="store_true", - default=False, - help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune for matmul configs") + parser.add_argument("--with_roller", action="store_true", default=False, help="Whether to enable BitBLAS roller for search space") args = parser.parse_args() main(args.m, args.n, args.k, args.use_autotune, args.with_roller) diff --git a/examples/gemm/example_gemm_intrinsics.py b/examples/gemm/example_gemm_intrinsics.py index 5c014ce3a..d4bc9480f 100644 --- a/examples/gemm/example_gemm_intrinsics.py +++ b/examples/gemm/example_gemm_intrinsics.py @@ -4,7 +4,8 @@ import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func @@ -34,18 +35,18 @@ def tl_matmul( 
accum_dtype, ): assert in_dtype in [ - "float16", - "int8", + T.float16, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - if out_dtype == "int32": + if out_dtype == T.int32: micro_size_k = 32 # This is a debug config @@ -53,7 +54,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 64 warp_col_tiles = 64 - # chunk = 32 if in_dtype == "float16" else 64 + # chunk = 32 if in_dtype == T.float16 else 64 chunk = 32 shared_scope = "shared.dyn" @@ -99,12 +100,11 @@ def tl_matmul( @T.prim_func def gemm_intrinsics( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -112,10 +112,12 @@ def gemm_intrinsics( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -123,7 +125,6 @@ def gemm_intrinsics( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -133,7 +134,6 @@ def gemm_intrinsics( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a(A_local, A_shared, ki) @@ -163,7 +163,7 @@ def ref_program(A, B): def main(M=4096, N=4096, K=4096): - in_dtype, out_dtype, accum_dtype = "float16", "float16", "float32" + in_dtype, out_dtype, accum_dtype = T.float16, T.float16, T.float32 kernel = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) src_code = kernel.get_kernel_source() # src_code is the generated cuda source @@ -181,5 +181,12 @@ def main(M=4096, N=4096, K=4096): profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) +def run_regression_perf(M=4096, N=4096, K=4096): + in_dtype, out_dtype, accum_dtype = "float16", "float16", "float32" + kernel = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) + profiler = kernel.get_profiler() + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main(M=4096, N=4096, K=4096) diff --git a/examples/gemm/example_gemm_persistent.py b/examples/gemm/example_gemm_persistent.py index a2a7122d3..ad3d556ed 100644 --- a/examples/gemm/example_gemm_persistent.py +++ b/examples/gemm/example_gemm_persistent.py @@ -5,22 +5,12 @@ @tilelang.jit(out_idx=[-1]) -def matmul_non_persistent(M, - N, - K, - block_M, - block_N, - block_K, - threads, - num_stages, - dtype="float16", - accum_dtype="float"): - +def matmul_non_persistent(M, N, K, block_M, block_N, block_K, threads, num_stages, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), 
- B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -43,18 +33,9 @@ def main( @tilelang.jit(out_idx=[-1]) -def matmul_persistent(M, - N, - K, - block_M, - block_N, - block_K, - threads, - num_stages, - dtype="float16", - accum_dtype="float", - use_persistent_primitive=True): - +def matmul_persistent( + M, N, K, block_M, block_N, block_K, threads, num_stages, dtype=T.float16, accum_dtype=T.float32, use_persistent_primitive=True +): sm_num = driver.get_num_sms() m_blocks = T.ceildiv(M, block_M) n_blocks = T.ceildiv(N, block_N) @@ -63,9 +44,9 @@ def matmul_persistent(M, @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -90,9 +71,9 @@ def main( @T.prim_func def main_persistent_primitive( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(sm_num, threads=threads) as (block_id): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -100,8 +81,7 @@ def main_persistent_primitive( C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_shared = T.alloc_shared((block_M, block_N), dtype) - for bx, by in T.Persistent( - [T.ceildiv(M, block_M), T.ceildiv(N, block_N)], sm_num, block_id): + for bx, by in T.Persistent([T.ceildiv(M, block_M), T.ceildiv(N, block_N)], sm_num, block_id): T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[bx * block_M, k * block_K], A_shared) @@ -128,18 +108,15 @@ def main(M=4096, N=4096, K=4096): num_stages = 3 persistent_kernel = matmul_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) - persistent_profiler = persistent_kernel.get_profiler( - tensor_supply_type=tilelang.TensorSupplyType.Randn) + persistent_profiler = persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) persistent_profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("Persistent GEMM: All check passed.") persistent_latency = persistent_profiler.do_bench(warmup=500) print(f"Persistent GEMM Latency: {persistent_latency} ms") print(f"Persistent GEMM TFlops: {total_flops / persistent_latency * 1e-9} TFlops") - non_persistent_kernel = matmul_non_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, - num_stages) - non_persistent_profiler = non_persistent_kernel.get_profiler( - tensor_supply_type=tilelang.TensorSupplyType.Randn) + non_persistent_kernel = matmul_non_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) + non_persistent_profiler = non_persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) non_persistent_profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("Non-Persistent GEMM: All check passed.") non_persistent_latency = non_persistent_profiler.do_bench(warmup=500) @@ -149,11 +126,22 @@ def main(M=4096, N=4096, K=4096): print(f"Persistent GEMM Speedup: {non_persistent_latency / persistent_latency}") +def run_regression_perf(M=4096, N=4096, K=4096): + BLOCK_M = 128 + BLOCK_N = 256 + BLOCK_K 
= 64 + threads = 256 + num_stages = 3 + persistent_kernel = matmul_persistent(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, threads, num_stages) + persistent_profiler = persistent_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Randn) + return persistent_profiler.do_bench(backend="cupti") + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--M', type=int, default=8192, help='M dimension') - parser.add_argument('--N', type=int, default=8192, help='N dimension') - parser.add_argument('--K', type=int, default=8192, help='K dimension') + parser.add_argument("--M", type=int, default=8192, help="M dimension") + parser.add_argument("--N", type=int, default=8192, help="N dimension") + parser.add_argument("--K", type=int, default=8192, help="K dimension") args = parser.parse_args() M, N, K = args.M, args.N, args.K main(M, N, K) diff --git a/examples/gemm/example_gemm_schedule.py b/examples/gemm/example_gemm_schedule.py index f4727412b..17dbcc568 100644 --- a/examples/gemm/example_gemm_schedule.py +++ b/examples/gemm/example_gemm_schedule.py @@ -3,13 +3,12 @@ @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def gemm_schedule( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -65,5 +64,19 @@ def main(): print(kernel.get_kernel_source()) +def run_regression_perf(): + kernel = matmul(1024, 1024, 1024, 128, 128, 32) + import torch + + a = torch.randn(1024, 1024).cuda().half() + b = torch.randn(1024, 1024).cuda().half() + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(a, b) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm/regression_example_gemm.py b/examples/gemm/regression_example_gemm.py new file mode 100644 index 000000000..3583cf16a --- /dev/null +++ b/examples/gemm/regression_example_gemm.py @@ -0,0 +1,25 @@ +import tilelang.testing +import example_gemm +import example_gemm_autotune +import example_gemm_intrinsics +import example_gemm_schedule + + +def regression_example_gemm_autotune(): + tilelang.testing.process_func(example_gemm_autotune.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_gemm_intrinsics(): + tilelang.testing.process_func(example_gemm_intrinsics.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_gemm_schedule(): + tilelang.testing.process_func(example_gemm_schedule.run_regression_perf) + + +def regression_example_gemm(): + tilelang.testing.process_func(example_gemm.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemm_fp8/README.md b/examples/gemm_fp8/README.md index 9d7011a06..2b3dc9560 100644 --- a/examples/gemm_fp8/README.md +++ b/examples/gemm_fp8/README.md @@ -1 +1 @@ -**Notes**: Now we only support fp8 with mma instructions instead of `T.gemm`, because the cutlass version of tilelang is too old, we should update the cutlass version in future. 
\ No newline at end of file +**Notes**: FP8 GEMM is currently supported only via MMA instructions rather than `T.gemm`, because the CUTLASS version bundled with tilelang is too old; the bundled CUTLASS should be upgraded in a future release. diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd.py b/examples/gemm_fp8/example_tilelang_gemm_amd.py index 0e6ace757..93f8c4980 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_amd.py +++ b/examples/gemm_fp8/example_tilelang_gemm_amd.py @@ -17,10 +17,8 @@ def supply_prog(args): a_param, b_param = args M, K = a_param.shape N, _ = b_param.shape - a = (torch.randn(M, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) return [a, b] @@ -35,40 +33,36 @@ def get_configs(): valid_configs = [] - for m, n, k, stages, t, kp, gemm_type in itertools.product(block_Ms, block_Ns, block_Ks, - num_stages, num_threads, k_packs, - gemm_types): - valid_configs.append({ - "block_M": m, - "block_N": n, - "block_K": k, - "num_stages": stages, - "num_threads": t, - "k_pack": kp, - "gemm_type": gemm_type, - }) + for m, n, k, stages, t, kp, gemm_type in itertools.product(block_Ms, block_Ns, block_Ks, num_stages, num_threads, k_packs, gemm_types): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "block_K": k, + "num_stages": stages, + "num_threads": t, + "k_pack": kp, + "gemm_type": gemm_type, + } + ) return valid_configs @tilelang.autotune( - configs=get_configs(), - cache_input_tensors=True, - ref_prog=ref_program, - manual_check_prog=manual_check_prog, - supply_prog=supply_prog) + configs=get_configs(), cache_input_tensors=True, ref_prog=ref_program, manual_check_prog=manual_check_prog, supply_prog=supply_prog +) @tilelang.jit(out_idx=[-1]) def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pack, gemm_type): - dtype = "float8_e4m3fnuz" - accum_dtype = "float" + dtype = T.float8_e4m3fnuz + accum_dtype = T.float32 @T.prim_func def gemm_fp8_rs( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): A_local = T.alloc_fragment((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -77,24 +71,17 @@ def gemm_fp8_rs( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_local) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_local, - B_shared, - C_local, - transpose_B=True, - k_pack=k_pack, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(A_local, B_shared, C_local, transpose_B=True, k_pack=k_pack, policy=T.GemmWarpPolicy.FullRow) T.copy(C_local, C[by * block_M, bx * block_N]) @T.prim_func def gemm_fp8_ss( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), 
threads=num_threads) as (bx, by): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) @@ -103,13 +90,7 @@ def gemm_fp8_ss( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_local, - transpose_B=True, - k_pack=k_pack, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(A_shared, B_shared, C_local, transpose_B=True, k_pack=k_pack, policy=T.GemmWarpPolicy.FullRow) T.copy(C_local, C[by * block_M, bx * block_N]) @@ -123,10 +104,8 @@ def gemm_fp8_ss( def test_gemm_fp8(M, N, K): kernel = fp8_matmul(M, N, K) - a = (torch.randn(M, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device='cuda') * - 0.01).to(dtype=torch.float8_e4m3fnuz) + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) c = kernel(a, b) ref_c = ref_program(a, b) torch_assert_close(c, ref_c, rtol=1e-2, atol=1e-2) diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8.py b/examples/gemm_fp8/example_tilelang_gemm_fp8.py index a403ed068..086997975 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8.py @@ -1,7 +1,6 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type def calc_diff(x, y): @@ -12,13 +11,12 @@ def calc_diff(x, y): @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype=T.float32): @T.prim_func def gemm_fp8( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -37,12 +35,12 @@ def gemm_fp8( def test_gemm_fp8(M, N, K, dtype): - torch_dtype = map_torch_type(dtype) + torch_dtype = T.dtype(dtype).as_torch() kernel = matmul(M, N, K, 128, 128, 64, dtype) - a = torch.randn(M, K, dtype=torch.float16, device='cuda').to(dtype=torch_dtype) - b = torch.randn(N, K, dtype=torch.float16, device='cuda').to(dtype=torch_dtype) + a = torch.randn(M, K, dtype=torch.float16, device="cuda").to(dtype=torch_dtype) + b = torch.randn(N, K, dtype=torch.float16, device="cuda").to(dtype=torch_dtype) c = kernel(a, b) @@ -57,8 +55,21 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 1024, 'float8_e4m3') - test_gemm_fp8(1024, 1024, 1024, 'float8_e5m2') + test_gemm_fp8(1024, 1024, 1024, T.float8_e4m3fn) + test_gemm_fp8(1024, 1024, 1024, T.float8_e5m2) + + +def run_regression_perf(): + M, N, K = 4096, 4096, 4096 + dtype = "float8_e4m3" + kernel_e4m3 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + dtype = "float8_e5m2" + kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e5m2 = 
kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py index 1d9207aff..a702e8ae0 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py @@ -1,11 +1,10 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type @tilelang.jit(out_idx=[-1]) -def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): +def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype=T.float32): # for fp8 gemm, do one promote after 4 wgmma inst, i.e. block_K = 128. # if block_K < 128, promote after 128/block_K iters. # if block_K > 128, promote after every iter. @@ -13,9 +12,9 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype, accum_dtype="float"): @T.prim_func def gemm_fp8_2xAcc( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), accum_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), accum_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): A_shared = T.alloc_shared((block_M, block_K), dtype) @@ -55,18 +54,18 @@ def calc_diff(x, y): def test_gemm_fp8(M, N, K, dtype): - torch_dtype = map_torch_type(dtype) + torch_dtype = T.dtype(dtype).as_torch() kernel = matmul(M, N, K, 128, 128, 64, dtype) - a = torch.rand(M, K, dtype=torch.float16, device='cuda') + a = torch.rand(M, K, dtype=torch.float16, device="cuda") a = (100 * (2 * a - 1)).to(dtype=torch_dtype) - b = torch.rand(N, K, dtype=torch.float16, device='cuda') + b = torch.rand(N, K, dtype=torch.float16, device="cuda") b = (100 * (2 * b - 1)).to(dtype=torch_dtype) c = kernel(a, b) - ref_c = (a.float() @ b.float().T) + ref_c = a.float() @ b.float().T diff = calc_diff(c, ref_c) print(f"diff: {diff}") @@ -74,8 +73,21 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 8192, 'float8_e4m3') - test_gemm_fp8(1024, 1024, 8192, 'float8_e5m2') + test_gemm_fp8(1024, 1024, 8192, T.float8_e4m3fn) + test_gemm_fp8(1024, 1024, 8192, T.float8_e5m2) + + +def run_regression_perf(): + M, N, K = 1024, 1024, 8192 + dtype = "float8_e4m3" + kernel_e4m3 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + dtype = "float8_e5m2" + kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py index ed44aab69..762885ec3 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py @@ -5,7 +5,8 @@ import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter,) + TensorCoreIntrinEmitter, +) from tilelang.transform import simplify_prim_func from tilelang.utils.tensor import map_torch_type @@ -38,21 +39,26 @@ def tl_matmul( accum_dtype, ): assert in_dtype in 
[ - "float16", - "float8_e4m3", - "float8_e5m2", - "int8", + T.float16, + T.float8_e4m3fn, + T.float8_e5m2, + T.int8, ], "Currently only float16 and int8 are supported" assert out_dtype in [ - "float16", - "float32", - "int32", + T.float16, + T.float32, + T.int32, ], "Currently only float16, float32 and int32 are supported" micro_size_x = micro_size_y = micro_size_k = 16 - is_float8 = in_dtype in ["float8_e4m3", "float8_e5m2"] - if out_dtype == "int32" or is_float8: + is_float8 = in_dtype in [ + T.float8_e4m3fn, + T.float8_e5m2, + T.float8_e4m3fn, + T.float8_e5m2fnuz, + ] + if out_dtype == T.int32 or is_float8: micro_size_k = 32 # This is a debug config @@ -60,7 +66,7 @@ def tl_matmul( block_col_warps = 2 warp_row_tiles = 32 warp_col_tiles = 32 - chunk = 32 if in_dtype == "float16" else 64 + chunk = 32 if in_dtype == T.float16 else 64 shared_scope = "shared.dyn" # Pipeline Stage @@ -105,12 +111,11 @@ def tl_matmul( @T.prim_func def gemm_fp8_intrinsic( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): - A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope=shared_scope) C_shared = T.alloc_shared(C_shared_shape, out_dtype, scope=shared_scope) @@ -118,10 +123,12 @@ def gemm_fp8_intrinsic( B_local = T.alloc_local((warp_cols * local_size_b), in_dtype) C_local = T.alloc_local((warp_rows * warp_cols * local_size_c), accum_dtype) - T.annotate_layout({ - A_shared: make_swizzle_layout(A_shared), - B_shared: make_swizzle_layout(B_shared), - }) + T.annotate_layout( + { + A_shared: make_swizzle_layout(A_shared), + B_shared: make_swizzle_layout(B_shared), + } + ) # Improve L2 Cache T.use_swizzle(panel_size=10) @@ -129,7 +136,6 @@ def gemm_fp8_intrinsic( T.clear(C_local) for ko in T.Pipelined((K // block_K), num_stages=stage): - # Load A into shared memory for i, k in T.Parallel(block_M, block_K): A_shared[i, k] = A[by * block_M + i, ko * block_K + k] @@ -139,7 +145,6 @@ def gemm_fp8_intrinsic( B_shared[j, k] = B[bx * block_N + j, ko * block_K + k] for ki in T.serial(0, (block_K // micro_size_k)): - # Load A into fragment mma_emitter.ldmatrix_a( A_local, @@ -215,8 +220,22 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): def main(): - assert_tl_matmul_correctness(128, 128, 128, "float8_e4m3", "float32", "float32") - assert_tl_matmul_correctness(128, 128, 128, "float8_e5m2", "float32", "float32") + assert_tl_matmul_correctness(128, 128, 128, T.float8_e4m3fn, T.float32, T.float32) + assert_tl_matmul_correctness(128, 128, 128, T.float8_e5m2, T.float32, T.float32) + + +def run_regression_perf(): + M, N, K = 4096, 4096, 4096 + out_dtype, accum_dtype = "float32", "float32" + in_dtype = T.float8_e4m3fn + kernel_e4m3 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) + profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + in_dtype = T.float8_e5m2 + kernel_e5m2 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) + profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py 
b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py new file mode 100644 index 000000000..aa7e8b360 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py @@ -0,0 +1,124 @@ +import torch +import tilelang +import tilelang.language as T +from tilelang.utils.tensor import map_torch_type + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + mbar = T.alloc_barrier(1) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + T.gemm_v2( + A_shared, + B_shared, + C_tmem, + trans_A, + trans_B, + mbar=mbar, + wg_wait=-1, + clear_accum=(k == 0), + ) + T.mbarrier_wait_parity(mbar, k % 2) + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return main + + +def calc_diff(x, y): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +M, N, K = 4096, 4096, 8192 +block_M, block_N, block_K = 64, 256, 32 +trans_A, trans_B = False, True +num_stages = 2 +threads = 256 +for tvm_fp8_dtype in [T.float8_e4m3fn, T.float8_e5m2]: + for tvm_acc_dtype in [T.float16, T.float32]: # , torch.float16]: + torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) + torch_acc_dtype = map_torch_type(tvm_acc_dtype) + print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") + in_dtype, out_dtype, accum_dtype = tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype + + func = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, + ) + jit_kernel = tilelang.compile( + func, + out_idx=[2], + target="cuda", + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, + }, + ) + # jit_kernel.export_ptx("./dump.ptx") + # jit_kernel.export_sources("./dump.cu") + + a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch_fp8_dtype) + + c = jit_kernel(a, b) + ref_c = (a.to(torch.half) @ b.T.to(torch.half)).float() + c = c.float() + diff = calc_diff(c, ref_c) + # assert diff < 1e-3, f"{diff}" + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] diff = {diff}") + + profiler = jit_kernel.get_profiler() + latency = profiler.do_bench() + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Latency: {latency} ms") + print(f"[{tvm_fp8_dtype} -> {tvm_acc_dtype}] Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS") diff --git a/examples/gemm_fp8/regression_example_gemm_fp8.py 
b/examples/gemm_fp8/regression_example_gemm_fp8.py new file mode 100644 index 000000000..3ba2f4f27 --- /dev/null +++ b/examples/gemm_fp8/regression_example_gemm_fp8.py @@ -0,0 +1,20 @@ +import tilelang.testing +import example_tilelang_gemm_fp8 +import example_tilelang_gemm_fp8_2xAcc +import example_tilelang_gemm_fp8_intrinsic + + +def regression_example_tilelang_gemm_fp8_2xAcc(): + tilelang.testing.process_func(example_tilelang_gemm_fp8_2xAcc.run_regression_perf) + + +def regression_example_tilelang_gemm_fp8_intrinsic(): + tilelang.testing.process_func(example_tilelang_gemm_fp8_intrinsic.run_regression_perf) + + +def regression_example_tilelang_gemm_fp8(): + tilelang.testing.process_func(example_tilelang_gemm_fp8.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemm_fp8/test_example_gemm_fp8.py b/examples/gemm_fp8/test_example_gemm_fp8.py index 19a9ee00a..8a60d0e02 100644 --- a/examples/gemm_fp8/test_example_gemm_fp8.py +++ b/examples/gemm_fp8/test_example_gemm_fp8.py @@ -1,17 +1,30 @@ +import pytest +import torch import tilelang.testing import example_tilelang_gemm_fp8_2xAcc import example_tilelang_gemm_fp8_intrinsic import example_tilelang_gemm_fp8 +def requires_sm89(): + """FP8 tensor core MMA requires SM89 (Ada Lovelace) or higher.""" + major, minor = torch.cuda.get_device_capability() + return pytest.mark.skipif( + major < 9 and not (major == 8 and minor >= 9), reason="FP8 tensor core MMA requires SM89 or higher (Ada Lovelace/Hopper)" + ) + + +@requires_sm89() def test_example_tilelang_gemm_fp8_2xAcc(): example_tilelang_gemm_fp8_2xAcc.main() +@requires_sm89() def test_example_tilelang_gemm_fp8_intrinsic(): example_tilelang_gemm_fp8_intrinsic.main() +@requires_sm89() def test_example_tilelang_gemm_fp8(): example_tilelang_gemm_fp8.main() diff --git a/examples/gemm_sm100/README.md b/examples/gemm_sm100/README.md index 73dd76c30..d630d2d0d 100644 --- a/examples/gemm_sm100/README.md +++ b/examples/gemm_sm100/README.md @@ -40,19 +40,19 @@ import tilelang.language as T @T.prim_func def main( - A: T.Tensor((M, K), "bfloat16"), - B: T.Tensor((N, K), "bfloat16"), - C: T.Tensor((M, N), "bfloat16"), + A: T.Tensor((M, K), T.bfloat16), + B: T.Tensor((N, K), T.bfloat16), + C: T.Tensor((M, N), T.bfloat16), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): # 1. Allocate memory buffers - A_shared = T.alloc_shared((block_M, block_K), "bfloat16") # A matrix shared memory - B_shared = T.alloc_shared((block_N, block_K), "bfloat16") # B matrix shared memory - C_tmem = T.alloc_tmem([block_M, block_N], "float") # TCGEN5MMA output to Tensor Memory + A_shared = T.alloc_shared((block_M, block_K), T.bfloat16) # A matrix shared memory + B_shared = T.alloc_shared((block_N, block_K), T.bfloat16) # B matrix shared memory + C_tmem = T.alloc_tmem([block_M, block_N], T.float) # TCGEN5MMA output to Tensor Memory mbar = T.alloc_barrier(1) # mbarrier synchronization primitive - C_local = T.alloc_fragment((block_M, block_N), "float") # Register storage - C_shared = T.alloc_shared((block_M, block_N), "bfloat16") # Output shared memory + C_local = T.alloc_fragment((block_M, block_N), T.float) # Register storage + C_shared = T.alloc_shared((block_M, block_N), T.bfloat16) # Output shared memory # 2. 
Main computation loop for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): @@ -103,4 +103,3 @@ latency = profiler.do_bench() print(f"Latency: {latency} ms") print(f"Performance: {2 * M * N * K / (latency/1e3) / 1e12:.2f} TFLOPS") ``` - diff --git a/examples/gemm_sm100/gemm_mma.py b/examples/gemm_sm100/gemm_mma.py index a58e5a7c0..226e33c01 100644 --- a/examples/gemm_sm100/gemm_mma.py +++ b/examples/gemm_sm100/gemm_mma.py @@ -4,13 +4,12 @@ # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): @@ -62,7 +61,8 @@ def main( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) print(jit_kernel.get_kernel_source()) # 3. Test the kernel in Python with PyTorch data import torch diff --git a/examples/gemm_sm100/gemm_tcgen5mma.py b/examples/gemm_sm100/gemm_tcgen5mma.py index 9008c7ef5..523a94fea 100644 --- a/examples/gemm_sm100/gemm_tcgen5mma.py +++ b/examples/gemm_sm100/gemm_tcgen5mma.py @@ -25,9 +25,9 @@ def matmul( @T.prim_func def main( - A: T.Tensor(A_shape, in_dtype), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) @@ -40,15 +40,7 @@ def main( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm( - A_shared, - B_shared, - C_tmem, - trans_A, - trans_B, - mbar=mbar, - wg_wait=-1, - clear_accum=k == 0) + T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k == 0) T.mbarrier_wait_parity(mbar, k % 2) T.copy(C_tmem, C_local) @@ -62,12 +54,11 @@ def main( M, N, K = 4096, 4096, 8192 block_M, block_N, block_K = 128, 256, 128 trans_A, trans_B = False, True -in_dtype, out_dtype, accum_dtype = "bfloat16", "bfloat16", "float" +in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float num_stages = 2 threads = 256 -func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, - accum_dtype, num_stages, threads) +func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) jit_kernel = tilelang.compile( func, out_idx=[2], @@ -75,7 +66,8 @@ def main( pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) print(jit_kernel.get_kernel_source()) @@ -88,4 +80,4 @@ def main( profiler = jit_kernel.get_profiler() latency = profiler.do_bench() print(f"Latency: {latency} ms") -print(f"Flops: {2 * M * N * K / (latency/1e3) / 1e12} TFLOPS") +print(f"Flops: {2 * M * N * K / (latency / 1e3) / 1e12} TFLOPS") diff --git a/examples/gemm_sp/example_custom_compress.py b/examples/gemm_sp/example_custom_compress.py new file mode 100644 
index 000000000..0544b8255 --- /dev/null +++ b/examples/gemm_sp/example_custom_compress.py @@ -0,0 +1,337 @@ +import argparse + +import tilelang +import tilelang.language as T + +from tilelang.layout import make_cutlass_metadata_layout +from tilelang.utils.sparse import randn_semi_sparse +from tilelang.utils.tensor import torch_assert_close + +from triton.testing import do_bench + +import torch + +torch.manual_seed(42) + +DEFAULT_CONFIG = { # take best config from autotune script + "4090": { + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 64, + "num_stages": 1, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 256, + "block_N": 128, + "block_K": 64, + "num_stages": 2, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, + "h20": { + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, +} + +ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} + + +@tilelang.jit(out_idx=[-1]) +def matmul_sp_fp16_custom_compress( + M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, enable_rasterization, use_cutlass_layout +): + e_factor, e_dtype = (16, T.int16) + + @T.prim_func + def gemm_sp_fp16_custom_compress( + A_sparse: T.Tensor((M, K // 2), T.float16), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), T.float16), + C: T.Tensor((M, N), accum_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K // 2), T.float16) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + B_shared = T.alloc_shared((block_K, block_N), T.float16) + C_shared = T.alloc_shared((block_M, block_N), accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + if use_cutlass_layout: + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="8.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="8.0", block_k=block_K), + } + ) + T.clear(C_local) + T.disable_warp_group_reg_alloc() + T.use_swizzle(panel_size=10, enable=enable_rasterization) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) + T.copy(E[by * block_M, k * block_K // e_factor], E_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm_sp_v2(A_shared, E_shared, B_shared, C_local, False, False, policy=policy) + + T.copy(C_local, C_shared) + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return gemm_sp_fp16_custom_compress + + +def torch_compress(dense): + """ + A naive compression function, where each 4-bit meta matches 4 elements in original matrix in row major layout. 
+ """ + if dense.dim() != 2: + raise RuntimeError(f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor") + + m, k = dense.shape + + meta_dtype = torch.int8 + if dense.dtype == torch.int8: + meta_dtype = torch.int32 + elif dense.dtype in [torch.half, torch.bfloat16, torch.float]: + meta_dtype = torch.int16 + else: + raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix") + quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 + if quadbits_per_meta_elem not in (4, 8): + raise RuntimeError("Invalid number of elements per meta element calculated") + + if meta_dtype == torch.int32: + if m % 16 != 0: + raise RuntimeError(f"Number of rows of dense matrix {m} must be divisible by 16") + else: + if m % 32 != 0: + raise RuntimeError(f"Number of rows of dense matrix {m} must be divisible by 32") + if k % (4 * quadbits_per_meta_elem) != 0: + raise RuntimeError(f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}") + + if dense.dtype != torch.float: + ksparse = 4 + dense_4 = dense.view(-1, k // ksparse, ksparse) + m0, m1, _m2, m3 = (dense_4 != 0).unbind(-1) + else: + ksparse = 2 + dense_2 = dense.view(-1, k // ksparse, ksparse) + m0, _m2 = m1, m3 = (dense_2 != 0).unbind(-1) + meta_ncols = k // (ksparse * quadbits_per_meta_elem) + + # Encoding quadruples of True/False values as follows: + # [True, True, False, False] -> 0b0100 + # [True, False, True, False] -> 0b1000 + # [False, True, True, False] -> 0b1001 + # [True, False, False, True ] -> 0b1100 + # [False, True, False, True ] -> 0b1101 + # [False, False, True, True ] -> 0b1110 + # Thus, lower two bits in the encoding are index of the True value + # at the lowest index in the quadruple, and the higher two bits in + # the encoding are index of the other True value in the quadruple. + # In case there are less than two True values, than False value or + # values at some index or indices are considered True for the + # encoding. In case there are more than two True values, then the + # excess True value(s) at some indices are considered False for + # the encoding. The exact encodings used for these cases are as + # follows: + # [False, False, False, False] -> 0b1110 + # [False, False, False, True ] -> 0b1110 + # [False, False, True, False] -> 0b1110 + # [False, True, False, False] -> 0b1001 + # [False, True, True, True ] -> 0b1101 + # [True, False, False, False] -> 0b1000 + # [True, False, True, True ] -> 0b1100 + # [True, True, False, True ] -> 0b0100 + # [True, True, True, False] -> 0b0100 + # [True, True, True, True ] -> 0b0100 + # These particular encodings are chosen, with the help of Espresso + # logic minimizer software, for the purpose of minimization of + # corresponding Boolean functions, that translate non-zero flags + # into encoding bits. Note also possible choices for the first + # and last of these encodings were limited only to (0b0100, + # 0b1110), in order to produce valid encodings for 1:2 sparsity + # case. 
+ + expr0 = m0 & m1 + expr1 = ~m0 & m1 + expr2 = ~m0 & ~m1 + bit0 = expr1 + bit1 = expr2 + bit2 = expr0 | expr2 | m3 + bit3 = expr1 | ~m1 + idxs0 = bit0 | (bit1.to(torch.int64) << 1) + idxs1 = bit2 | (bit3.to(torch.int64) << 1) + + if dense.dtype != torch.float: + sparse0 = dense_4.gather(-1, idxs0.unsqueeze(-1)) # type: ignore[possibly-undefined] + sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1)) + sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2) + else: + sparse = dense_2.gather(-1, idxs0.unsqueeze(-1) // 2).view(m, k // 2) # type: ignore[possibly-undefined] + + meta_4 = idxs0 | (idxs1 << 2) + meta_n = meta_4.view((-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype) + + if quadbits_per_meta_elem == 4: + meta = meta_n[:, :, 0] | (meta_n[:, :, 1] << 4) | (meta_n[:, :, 2] << 8) | (meta_n[:, :, 3] << 12) + elif quadbits_per_meta_elem == 8: + meta = ( + meta_n[:, :, 0] + | (meta_n[:, :, 1] << 4) + | (meta_n[:, :, 2] << 8) + | (meta_n[:, :, 3] << 12) + | (meta_n[:, :, 4] << 16) + | (meta_n[:, :, 5] << 20) + | (meta_n[:, :, 6] << 24) + | (meta_n[:, :, 7] << 28) + ) + + return (sparse, meta) + + +def decode_metadata(meta: torch.Tensor) -> torch.Tensor: + assert meta.dtype is torch.int16 + groups_per_meta = 16 // 4 # 4 groups per uint16 + out = [] + for g in range(groups_per_meta): + group_bits = (meta >> (g * 4)) & 0xF + idx0 = group_bits & 0x3 + idx1 = (group_bits >> 2) & 0x3 + out.append(torch.stack([idx0, idx1], dim=-1)) + return torch.concat(out, dim=-1).view(meta.shape[0], -1) + + +@tilelang.jit( + out_idx=[1, 2], + pass_configs={ + tilelang.PassConfigKey.TIR_DISABLE_VECTORIZE: True, + }, +) +def compress_kernel(M, K, block_M, block_K, dtype, use_cutlass_layout): + e_factor, e_dtype = ARCH_INFO["8.0"] + e_K = K // e_factor + elem, group = 2, 4 + + assert M % block_M == 0, "M must be divisible by block_M" + assert K % block_K == 0, "K must be divisible by block_K" + assert K % e_factor == 0, "K must be divisible by e_factor" + assert block_K % e_factor == 0, "block_K must be divisible by e_factor" + + @T.prim_func + def kernel( + A: T.Tensor((M, K), dtype), + A_sp: T.Tensor((M, K // 2), dtype), + E: T.Tensor((M, e_K), e_dtype), + ): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(K, block_K), threads=block_M) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + A_sp_shared = T.alloc_shared((block_M, block_K // 2), dtype) + E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) + if use_cutlass_layout: + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="8.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="8.0", block_k=block_K), + } + ) + T.clear(A_sp_shared) + T.clear(E_shared) + # TODO: alloc_var seems buggy here + non_zero_cnt = T.alloc_local((1,), dtype=T.uint8) + non_zero_elt_log_idx = T.alloc_local((elem,), dtype=T.uint8) + T.copy(A[bx * block_M, by * block_K], A_shared) + for tm in T.Parallel(block_M): + for g_i in range(0, block_K // group): + a_k = g_i * group + non_zero_cnt[0] = 0 + for i in range(elem): + non_zero_elt_log_idx[i] = 0 + for i in range(group): + val = A_shared[tm, a_k + i] + if val != 0.0: + non_zero_elt_log_idx[non_zero_cnt[0]] = i + A_sp_shared[tm, a_k // 2 + non_zero_cnt[0]] = val + non_zero_cnt[0] += 1 + # TODO: use T.device_assert(non_zero_cnt <= 2) after rebasing main + if non_zero_cnt[0] == 1 and non_zero_elt_log_idx[0] == 3: + non_zero_elt_log_idx[0] = 0 + non_zero_elt_log_idx[1] = 3 + A_sp_shared[tm, a_k // 2 + 1] = 
A_sp_shared[tm, a_k // 2] + A_sp_shared[tm, a_k // 2] = 0.0 + elif non_zero_cnt[0] == 1: + A_sp_shared[tm, a_k // 2 + 1] = 0 + non_zero_elt_log_idx[1] = 3 + for i in T.serial(elem): + val = non_zero_elt_log_idx[i] + E_shared[tm, a_k // e_factor] |= T.shift_left(val, 4 * (g_i % (e_factor // group)) + 2 * i) + T.copy(A_sp_shared, A_sp[bx * block_M, by * block_K // 2]) + T.copy(E_shared, E[bx * block_M, by * block_K // e_factor]) + + return kernel + + +def main(m=16384, n=16384, k=16384, use_cutlass_layout=False, use_torch_compressor=False, accum_dtype=None, cfg="4090"): + if accum_dtype is None: + accum_dtype = T.float + kernel = matmul_sp_fp16_custom_compress(m, n, k, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype], use_cutlass_layout=use_cutlass_layout) + + a = randn_semi_sparse(m, k, device="cuda", dtype=torch.half) + b = torch.randn(k, n, device="cuda", dtype=torch.half) + + if use_torch_compressor: + assert not use_cutlass_layout, "torch sparse must be used with naive layout" + a_sparse, e = torch_compress(a) + else: + a_sparse, e = compress_kernel(m, k, 32, 32, T.float16, use_cutlass_layout=use_cutlass_layout)(a) + + c = kernel(a_sparse, e, b) + + ref_c = a @ b + + assert not c.isnan().any(), "Reference result contains NaNs, please report an issue" + torch_assert_close(c, ref_c.to(c.dtype), rtol=1e-3, atol=1e-3) + print(f"Precision check passed. Max diff: {(c - ref_c).abs().max()}, Mean diff: {(c - ref_c).abs().mean()}") + + latency = do_bench(lambda: kernel(a_sparse, e, b)) + ref_latency = do_bench(lambda: a @ b) + + total_flops = 2 * m * n * k + tflops = total_flops / latency / 1e9 + ref_tflops = total_flops / ref_latency / 1e9 + print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") + print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency / 1e3:} s") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") + parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") + parser.add_argument("--use_cutlass_layout", action="store_true", help="Use cutlass layout for E tensor") + parser.add_argument("--use_torch_compressor", action="store_true", help="Use torch sparse for reference") + parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") + parser.add_argument("--cfg", type=str, choices=["4090"], default="4090") + args = parser.parse_args() + accum_dtype = T.float if args.accum_dtype == "float" else T.float16 + main(args.m, args.n, args.k, args.use_cutlass_layout, args.use_torch_compressor, accum_dtype, args.cfg) diff --git a/examples/gemm_sp/example_gemm_sp.py b/examples/gemm_sp/example_gemm_sp.py index 505f2b883..8163c84cc 100644 --- a/examples/gemm_sp/example_gemm_sp.py +++ b/examples/gemm_sp/example_gemm_sp.py @@ -1,11 +1,9 @@ -# Copyright (c) Tile-AI Corporation. -# Licensed under the MIT License. 
import argparse import tilelang import tilelang.language as T -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout from tilelang.utils.sparse import compress, randn_semi_sparse from tilelang.contrib import nvcc from triton.testing import do_bench @@ -14,86 +12,79 @@ arch = nvcc.get_target_compute_version() -ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} - -default_config = { # take best config from autotune script +DEFAULT_CONFIG = { # take best config from autotune script "4090": { - 'float': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 64, - 'num_stages': 1, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 64, + "num_stages": 1, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + T.float16: { + "block_M": 256, + "block_N": 128, + "block_K": 64, + "num_stages": 2, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, }, - 'float16': { - 'block_M': 256, - 'block_N': 128, - 'block_K': 64, - 'num_stages': 2, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True - } }, "h20": { - 'float': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 128, - 'num_stages': 3, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True + T.float: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, }, - 'float16': { - 'block_M': 128, - 'block_N': 64, - 'block_K': 128, - 'num_stages': 3, - 'thread_num': 128, - 'policy': T.GemmWarpPolicy.Square, - 'enable_rasterization': True - } - } + T.float16: { + "block_M": 128, + "block_N": 64, + "block_K": 128, + "num_stages": 3, + "thread_num": 128, + "policy": T.GemmWarpPolicy.Square, + "enable_rasterization": True, + }, + }, } +ARCH_INFO = {"8.0": (16, "int16"), "8.9": (16, "int16"), "9.0": (8, "uint8")} + @tilelang.jit(out_idx=[-1]) -def matmul_sp_fp16(M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, - enable_rasterization): +def matmul_sp_fp16(M, N, K, accum_dtype, block_M, block_N, block_K, num_stages, thread_num, policy, enable_rasterization): e_factor, e_dtype = ARCH_INFO[arch] @T.prim_func def gemm_sp_fp16( - A_sparse: T.Tensor((M, K // 2), 'float16'), - E: T.Tensor((M, K // e_factor), e_dtype), - B: T.Tensor((K, N), 'float16'), - C: T.Tensor((M, N), accum_dtype), + A_sparse: T.Tensor((M, K // 2), T.float16), + E: T.Tensor((M, K // e_factor), e_dtype), + B: T.Tensor((K, N), T.float16), + C: T.Tensor((M, N), accum_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): - A_shared = T.alloc_shared((block_M, block_K // 2), 'float16') + A_shared = T.alloc_shared((block_M, block_K // 2), T.float16) E_shared = T.alloc_shared((block_M, block_K // e_factor), e_dtype) - B_shared = T.alloc_shared((block_K, block_N), 'float16') + B_shared = T.alloc_shared((block_K, block_N), T.float16) C_shared = T.alloc_shared((block_M, block_N), accum_dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) T.clear(C_local) T.disable_warp_group_reg_alloc() T.use_swizzle(panel_size=10, enable=enable_rasterization) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", backend="cutlass", block_k=block_K, arch=arch), - E_shared: - 
make_metadata_layout( - E_shared, - mma_dtype="float16", - backend="cutlass", - block_k=block_K, - arch=arch), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, block_k=block_K, arch=arch), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, block_k=block_K, arch=arch), + } + ) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A_sparse[by * block_M, k * block_K // 2], A_shared) T.copy(E[by * block_M, k * block_K // e_factor], E_shared) @@ -106,30 +97,15 @@ def gemm_sp_fp16( return gemm_sp_fp16 -def main(): - parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") - parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") - parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") - parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument( - "--accum_dtype", - type=str, - default="float", - choices=["float", "float16"], - help="Accumulation datatype") - parser.add_argument("--cfg", type=str, choices=["4090", "h20"], required=True) - args = parser.parse_args() - kernel = matmul_sp_fp16(args.m, args.n, args.k, args.accum_dtype, - **default_config[args.cfg][args.accum_dtype]) +def main(m=16384, n=16384, k=16384, accum_dtype=None, cfg="4090"): + if accum_dtype is None: + accum_dtype = T.float + kernel = matmul_sp_fp16(m, n, k, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype]) - a = randn_semi_sparse(args.m, args.k, device='cuda', dtype=torch.half) - b = torch.randn(args.k, args.n, device='cuda', dtype=torch.half) + a = randn_semi_sparse(m, k, device="cuda", dtype=torch.half) + b = torch.randn(k, n, device="cuda", dtype=torch.half) - a_sparse, e = compress( - a, - transposed=False, - block_k=default_config[args.cfg][args.accum_dtype]['block_K'], - arch=arch) + a_sparse, e = compress(a, transposed=False, block_k=DEFAULT_CONFIG[cfg][accum_dtype]["block_K"], arch=arch) c = kernel(a_sparse, e, b) ref_c = a @ b @@ -141,12 +117,20 @@ def main(): latency = do_bench(lambda: kernel(a_sparse, e, b)) ref_latency = do_bench(lambda: a @ b) - total_flops = 2 * args.m * args.n * args.k + total_flops = 2 * m * n * k tflops = total_flops / latency / 1e9 ref_tflops = total_flops / ref_latency / 1e9 - print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency/1e3} s") - print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency/1e3:} s") + print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") + print(f"Reference TFLOPS: {ref_tflops:.2f}, Latency: {ref_latency / 1e3:} s") if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Autotuned MatMul Benchmark") + parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") + parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") + parser.add_argument("--cfg", type=str, choices=["4090", "h20"], default="4090") + args = parser.parse_args() + accum_dtype = T.float if args.accum_dtype == "float" else T.float16 + main(args.m, args.n, args.k, accum_dtype, args.cfg) diff --git a/examples/gemm_sp/test_example_gemm_sp.py b/examples/gemm_sp/test_example_gemm_sp.py new file mode 100644 index 000000000..fe26df144 --- /dev/null +++ b/examples/gemm_sp/test_example_gemm_sp.py @@ -0,0 +1,16 @@ +import 
tilelang.testing + +import example_custom_compress +import example_gemm_sp + + +def test_example_custom_compress(): + example_custom_compress.main() + + +def test_example_gemm_sp(): + example_gemm_sp.main() + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk.py b/examples/gemm_splitk/example_tilelang_gemm_splitk.py index c96669711..64ffade8e 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk.py @@ -3,27 +3,16 @@ @tilelang.jit -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - split_k, - dtype="float16", - accum_dtype="float", - out_dtype="float32"): - +def matmul(M, N, K, block_M, block_N, block_K, split_k, dtype=T.float16, accum_dtype=T.float32, out_dtype=T.float32): splitK = K // split_k @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) @@ -67,5 +56,28 @@ def main(): torch.testing.assert_close(c, ref_c.to(c.dtype), rtol=1e-2, atol=1e-2) +def run_regression_perf(): + M = 4096 + N = 4096 + K = 4096 + block_M = 128 + block_N = 128 + block_K = 32 + split_k = 4 + kernel = matmul(M, N, K, block_M, block_N, block_K, split_k) + import torch + + torch.random.manual_seed(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + c = torch.zeros(M, N).cuda().float() + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(a, b, c) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py index 145d622ed..3d33478cf 100644 --- a/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py +++ b/examples/gemm_splitk/example_tilelang_gemm_splitk_vectorize_atomicadd.py @@ -3,27 +3,16 @@ @tilelang.jit -def matmul(M, - N, - K, - block_M, - block_N, - block_K, - split_k, - dtype="float16", - accum_dtype="float", - out_dtype="float32"): - +def matmul(M, N, K, block_M, block_N, block_K, split_k, dtype=T.float16, accum_dtype=T.float32, out_dtype=T.float32): splitK = K // split_k @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((M, N), out_dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((M, N), out_dtype), ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split_k, threads=128) as (bx, by, bz): A_shared = T.alloc_shared((block_M, block_K), dtype) B_shared = T.alloc_shared((block_K, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), out_dtype) @@ -66,5 +55,29 @@ def main(): torch.testing.assert_close(c, ref_c.to(c.dtype), rtol=1e-2, atol=1e-2) +def run_regression_perf(): + M = 4096 + N = 4096 + K = 4096 + block_M = 128 + block_N = 128 + block_K = 32 + split_k = 4 + + kernel = matmul(M, N, 
K, block_M, block_N, block_K, split_k) + import torch + + torch.random.manual_seed(42) + a = torch.randn(M, K).cuda().half() + b = torch.randn(K, N).cuda().half() + c = torch.zeros(M, N).cuda().float() + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(a, b, c) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm_splitk/regression_example_gemm_splitk.py b/examples/gemm_splitk/regression_example_gemm_splitk.py new file mode 100644 index 000000000..c76b7e55c --- /dev/null +++ b/examples/gemm_splitk/regression_example_gemm_splitk.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_tilelang_gemm_splitk +import example_tilelang_gemm_splitk_vectorize_atomicadd + + +def regression_example_tilelang_gemm_splitk(): + tilelang.testing.process_func(example_tilelang_gemm_splitk.run_regression_perf) + + +def regression_example_tilelang_gemm_splitk_vectorize_atomicadd(): + tilelang.testing.process_func(example_tilelang_gemm_splitk_vectorize_atomicadd.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemm_streamk/example_tilelang_gemm_streamk.py b/examples/gemm_streamk/example_tilelang_gemm_streamk.py index 31cf40647..b2e8e9369 100644 --- a/examples/gemm_streamk/example_tilelang_gemm_streamk.py +++ b/examples/gemm_streamk/example_tilelang_gemm_streamk.py @@ -39,7 +39,7 @@ def cdiv(a, b): # Two-tile SK + DP streamk_tiles = total_tiles % streamk_programs -if (total_tiles - streamk_tiles > streamk_programs): # (total_tiles // total_programs > 1) +if total_tiles - streamk_tiles > streamk_programs: # (total_tiles // total_programs > 1) streamk_tiles += streamk_programs blocking_tiles = total_tiles - streamk_tiles @@ -77,95 +77,71 @@ def tl_matmul_streamk( A_shared_shape = (block_M, block_K) if not trans_A else (block_K, block_M) B_shared_shape = (block_K, block_N) if not trans_B else (block_N, block_K) - @T.macro - def compute_first_wave( - pid: T.int32, - A_buf: T.Tensor, - A_buf_shared: T.SharedBuffer, - B_buf: T.Tensor, - B_buf_shared: T.SharedBuffer, - C: T.Tensor, - C_local: T.LocalBuffer, - ): - start_iter = T.alloc_fragment((1,), "int32", "local") - end_iter = T.alloc_fragment((1,), "int32", "local") - - start_iter[0] = pid * streamk_full_tiles + T.min(pid, streamk_partial_tiles) - last_iter = (pid + 1) * streamk_full_tiles + T.min(pid + 1, streamk_partial_tiles) - - while start_iter[0] < last_iter: - end_iter[0] = T.min( - start_iter[0] + (iters_per_tile - (start_iter[0] % iters_per_tile)), - last_iter, - ) - - tile_id = start_iter[0] // iters_per_tile - remain_iters = start_iter[0] % iters_per_tile - pid_m = tile_id // T.ceildiv(N, block_N) - pid_n = tile_id % T.ceildiv(N, block_N) - - T.clear(C_local) - for k in T.Pipelined(end_iter[0] - start_iter[0], num_stages=num_stages): - T.copy( - A_buf[pid_m * block_M, (k + (start_iter[0] % iters_per_tile)) * block_K], - A_buf_shared, - ) - T.copy( - B_buf[pid_n * block_N, (k + (start_iter[0] % iters_per_tile)) * block_K], - B_buf_shared, - ) - T.gemm(A_buf_shared, B_buf_shared, C_local, transpose_B=trans_B) - - # last iteration of the tile always happens before its start on another SM - if remain_iters == 0 and (end_iter[0] % iters_per_tile == 0): - T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) - else: - for i, j in T.Parallel(block_M, block_N): - T.atomic_add(C[pid_m * block_M + i, pid_n * block_N + j], C_local[i, j]) - - start_iter[0] = end_iter[0] - - @T.macro - def compute_full_tiles( - pid: 
T.int32, - A_buf: T.Tensor, - A_shared: T.SharedBuffer, - B_buf: T.Tensor, - B_shared: T.SharedBuffer, - C: T.Tensor, - C_local: T.LocalBuffer, - ): - - for p in T.serial(sm_patition_factor): - tile_id = pid + streamk_tiles + p * total_sm - pid_m = tile_id // T.ceildiv(N, block_N) - pid_n = tile_id % T.ceildiv(N, block_N) - T.clear(C_local) - - for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): - T.copy(A_buf[pid_m * block_M, k * block_K], A_shared) - T.copy(B_buf[pid_n * block_N, k * block_K], B_shared) - T.gemm(A_shared, B_shared, C_local, transpose_B=trans_B) - T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) - @T.prim_func def main( - A: T.Tensor(A_shape, dtypeAB), - B: T.Tensor(B_shape, dtypeAB), - C: T.Tensor((M, N), dtypeC), + A: T.Tensor(A_shape, dtypeAB), + B: T.Tensor(B_shape, dtypeAB), + C: T.Tensor((M, N), dtypeC), ): with T.Kernel(streamk_programs, threads=threads) as pid: - A_shared = T.alloc_shared(A_shared_shape, dtypeAB) B_shared = T.alloc_shared(B_shared_shape, dtypeAB) A_shared_full_tiles = T.alloc_shared(A_shared_shape, dtypeAB) B_shared_full_tiles = T.alloc_shared(B_shared_shape, dtypeAB) C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - compute_first_wave(pid, A, A_shared, B, B_shared, C, C_local) + # compute first wave + start_iter = T.alloc_fragment((1,), T.int32, "local") + end_iter = T.alloc_fragment((1,), T.int32, "local") + + start_iter[0] = pid * streamk_full_tiles + T.min(pid, streamk_partial_tiles) + last_iter = (pid + 1) * streamk_full_tiles + T.min(pid + 1, streamk_partial_tiles) + while start_iter[0] < last_iter: + end_iter[0] = T.min( + start_iter[0] + (iters_per_tile - (start_iter[0] % iters_per_tile)), + last_iter, + ) + + tile_id = start_iter[0] // iters_per_tile + remain_iters = start_iter[0] % iters_per_tile + pid_m = tile_id // T.ceildiv(N, block_N) + pid_n = tile_id % T.ceildiv(N, block_N) + + T.clear(C_local) + for k in T.Pipelined(end_iter[0] - start_iter[0], num_stages=num_stages): + T.copy( + A[pid_m * block_M, (k + (start_iter[0] % iters_per_tile)) * block_K], + A_shared, + ) + T.copy( + B[pid_n * block_N, (k + (start_iter[0] % iters_per_tile)) * block_K], + B_shared, + ) + T.gemm(A_shared, B_shared, C_local, transpose_B=trans_B) + + # last iteration of the tile always happens before its start on another SM + if remain_iters == 0 and (end_iter[0] % iters_per_tile == 0): + T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) + else: + for i, j in T.Parallel(block_M, block_N): + T.atomic_add(C[pid_m * block_M + i, pid_n * block_N + j], C_local[i, j]) + + start_iter[0] = end_iter[0] + + # compute full tiles if sm_patition_factor > 0: - compute_full_tiles(pid, A, A_shared_full_tiles, B, B_shared_full_tiles, C, C_local) + for p in T.serial(sm_patition_factor): + tile_id = pid + streamk_tiles + p * total_sm + pid_m = tile_id // T.ceildiv(N, block_N) + pid_n = tile_id % T.ceildiv(N, block_N) + T.clear(C_local) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1): + T.copy(A[pid_m * block_M, k * block_K], A_shared_full_tiles) + T.copy(B[pid_n * block_N, k * block_K], B_shared_full_tiles) + T.gemm(A_shared_full_tiles, B_shared_full_tiles, C_local, transpose_B=trans_B) + T.copy(C_local, C[pid_m * block_M, pid_n * block_N]) return main @@ -181,9 +157,9 @@ def main(): BLOCK_SIZE_K, False, True, - "float16", - "float16", - "float32", + T.float16, + T.float16, + T.float32, 2, 64, ) @@ -201,5 +177,30 @@ def main(): torch.testing.assert_close(C, b_c, rtol=1e-2, atol=1e-2) +def run_regression_perf(): + kernel = 
tl_matmul_streamk( + m, + n, + k, + streamk_tiles, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + False, + True, + "float16", + "float16", + "float32", + 2, + 64, + ) + b_c = torch.zeros((m, n), device="cuda", dtype=torch.float16) + torch.cuda.synchronize() + + from tilelang.profiler import do_bench + + return do_bench(lambda: kernel(A, B, b_c), backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/gemm_streamk/test_example_tilelang_gemm_splitk.py b/examples/gemm_streamk/test_example_tilelang_gemm_streamk.py similarity index 100% rename from examples/gemm_streamk/test_example_tilelang_gemm_splitk.py rename to examples/gemm_streamk/test_example_tilelang_gemm_streamk.py diff --git a/examples/gemv/example_gemv.py b/examples/gemv/example_gemv.py index 4e43dcd9a..8ca77a2e8 100644 --- a/examples/gemv/example_gemv.py +++ b/examples/gemv/example_gemv.py @@ -17,15 +17,14 @@ def naive_gemv( K: int, BLOCK_N: int, BLOCK_K: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): - @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N)) as bn: tn = T.get_thread_binding(0) # tn = threadIdx.x @@ -38,8 +37,7 @@ def main( A_shared[tk] = A[bk * BLOCK_K + tk] B_shared[tn, tk] = B[bn * BLOCK_N + tn, bk * BLOCK_K + tk] for tk in T.serial(BLOCK_K): - C_reg[0] += A_shared[tk].astype(accum_dtype) * B_shared[tn, - tk].astype(accum_dtype) + C_reg[0] += A_shared[tk].astype(accum_dtype) * B_shared[tn, tk].astype(accum_dtype) C[bn * BLOCK_N + tn] = C_reg[0] return main @@ -51,15 +49,14 @@ def naive_splitk_gemv( K: int, BLOCK_N: int, BLOCK_K: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): - @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, BLOCK_K)) as bn: tn = T.get_thread_binding(0) @@ -88,16 +85,16 @@ def splitk_gemv( BLOCK_N: int, BLOCK_K: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): TILE_K = T.ceildiv(BLOCK_K, reduce_threads) @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -127,8 +124,8 @@ def splitk_gemv_vectorized( K: int, BLOCK_N: int, reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits @@ -136,9 +133,9 @@ def splitk_gemv_vectorized( @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -168,8 +165,8 @@ def splitk_gemv_vectorized_tvm( K: int, BLOCK_N: int, 
reduce_threads: int, - dtype: str = "float16", - accum_dtype: str = "float", + dtype: T.dtype = T.float16, + accum_dtype: T.dtype = T.float, ): MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits @@ -177,9 +174,9 @@ def splitk_gemv_vectorized_tvm( @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -197,9 +194,9 @@ def main( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -209,7 +206,8 @@ def main( C_reduced[0], tk, dtype="handle", - )) + ) + ) C[bn * BLOCK_N + tn] = C_reduced[0] @@ -218,10 +216,8 @@ def main( def get_block_template_configs(): iter_params = dict( - block_M=[2, 4, 8, 32, 64, 128], - block_N=[2, 4, 8, 32, 64, 128], - num_stages=[0, 1, 2, 3, 4], - threads=[32, 64, 128, 256]) + block_M=[2, 4, 8, 32, 64, 128], block_N=[2, 4, 8, 32, 64, 128], num_stages=[0, 1, 2, 3, 4], threads=[32, 64, 128, 256] + ) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -237,18 +233,11 @@ def get_block_template_configs(): }, out_idx=[2], ) -def gemv_alloc_reducer(M, - N, - block_M=128, - block_N=128, - num_stages=2, - threads=256, - dtype: str = "float16", - accum_dtype: str = "float"): - +def gemv_alloc_reducer( + M, N, block_M=128, block_N=128, num_stages=2, threads=256, dtype: T.dtype = T.float16, accum_dtype: T.dtype = T.float +): @T.prim_func - def main(a: T.Tensor((M, N), dtype), x: T.Tensor(N, dtype), o: T.Tensor(M, - dtype)): # type: ignore + def main(a: T.Tensor((M, N), dtype), x: T.Tensor(N, dtype), o: T.Tensor(M, dtype)): # type: ignore with T.Kernel(T.ceildiv(M, block_M), threads=threads) as i0_m: o_reducer = T.alloc_reducer(block_M, accum_dtype, replication="all") T.clear(o_reducer) @@ -287,17 +276,17 @@ def get_autotuned_kernel( BLOCK_N=None, reduce_threads=None, ): - dtype = "float16" - accum_dtype = "float" + dtype = T.float16 + accum_dtype = T.float32 MAX_TRANSACTION_SIZE_IN_BITS = 128 TILE_K = MAX_TRANSACTION_SIZE_IN_BITS // DataType(dtype).bits BLOCK_K = reduce_threads * TILE_K @T.prim_func def main( - A: T.Tensor((K,), dtype), - B: T.Tensor((N, K), dtype), - C: T.Tensor((N,), dtype), + A: T.Tensor((K,), dtype), + B: T.Tensor((N, K), dtype), + C: T.Tensor((N,), dtype), ): with T.Kernel(T.ceildiv(N, BLOCK_N), threads=(BLOCK_N, reduce_threads)) as bn: tn = T.get_thread_binding(0) @@ -315,9 +304,9 @@ def main( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), - "reduce_scope", - T.reinterpret(T.uint64(0), dtype="handle"), + T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), ): T.evaluate( T.tvm_thread_allreduce( @@ -327,21 +316,22 @@ def main( C_reduced[0], tk, dtype="handle", - )) + ) + ) C[bn * BLOCK_N + tn] = C_reduced[0] return main -def 
check_correctness_and_bench(kernel, N, K, bench_ref=True): +def check_correctness_and_bench(kernel, N, K, do_bench=True): profiler = kernel.get_profiler() profiler.assert_allclose(lambda x, y: x @ y.T, atol=1e-2, rtol=1e-2) - if bench_ref: + if do_bench: latency = profiler.do_bench(lambda x, y: x @ y.T, warmup=50) print(f"Torch Latency: {latency} ms") - latency = profiler.do_bench(kernel, warmup=50) - print(f"TileLang Latency: {latency} ms\n") + latency = profiler.do_bench(kernel, warmup=50) + print(f"TileLang Latency: {latency} ms\n") def main(do_bench: bool = True): @@ -350,16 +340,16 @@ def main(do_bench: bool = True): parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") args, _ = parser.parse_known_args() N, K = args.n, args.k - check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K) - check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K) - check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K) - check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K) + check_correctness_and_bench(naive_gemv(N, K, 128, 128), N, K, do_bench=do_bench) + check_correctness_and_bench(naive_splitk_gemv(N, K, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv(N, K, 32, 32, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(splitk_gemv_vectorized_tvm(N, K, 2, 32), N, K, do_bench=do_bench) + check_correctness_and_bench(gemv_alloc_reducer(N, K, block_M=128, block_N=128), N, K, do_bench=do_bench) print("Test passed!") - if not do_bench: + if do_bench: best_result = get_autotuned_kernel(N, K) best_config = best_result.config kernel = splitk_gemv_vectorized_tvm(N, K, **best_config) @@ -374,5 +364,23 @@ def main(do_bench: bool = True): print(f"TileLang BlockReduce Latency: {tilelang_tile_latency} ms\n") +def run_regression_perf(): + N, K = 4096, 4096 + latency = 0.0 + kernel_list = [ + naive_gemv(N, K, 128, 128), + naive_splitk_gemv(N, K, 32, 32), + splitk_gemv(N, K, 32, 32, 32), + splitk_gemv_vectorized(N, K, 2, 32), + splitk_gemv_vectorized_tvm(N, K, 2, 32), + gemv_alloc_reducer(N, K, block_M=128, block_N=128), + ] + for kernel in kernel_list: + profiler = kernel.get_profiler() + # Benchmark the TileLang kernel itself, not the PyTorch reference. 
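+        # Accumulate the per-kernel latency reported by do_bench (milliseconds, matching the "ms" prints above);
+        # the mean over all GEMV variants in kernel_list is returned below as the regression metric.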
+ latency += profiler.do_bench(backend="cupti") + return latency / len(kernel_list) + + if __name__ == "__main__": main() diff --git a/examples/gemv/regression_example_gemv.py b/examples/gemv/regression_example_gemv.py new file mode 100644 index 000000000..dd6f1d39f --- /dev/null +++ b/examples/gemv/regression_example_gemv.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_gemv + + +def regression_example_gemv(): + tilelang.testing.process_func(example_gemv.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/gemv/test_example_gemv.py b/examples/gemv/test_example_gemv.py index 3881ca769..323337a7a 100644 --- a/examples/gemv/test_example_gemv.py +++ b/examples/gemv/test_example_gemv.py @@ -1,5 +1,3 @@ -import tilelang.testing - import example_gemv @@ -8,4 +6,4 @@ def test_example_gemv(): if __name__ == "__main__": - tilelang.testing.main() + test_example_gemv() diff --git a/examples/grouped_gemm/example_grouped_gemm_bwd.py b/examples/grouped_gemm/example_grouped_gemm_bwd.py index ac8da7e2c..49cce0d1d 100644 --- a/examples/grouped_gemm/example_grouped_gemm_bwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_bwd.py @@ -5,78 +5,55 @@ import tilelang.language as T -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) -def grouped_gemm_fwd(batch_sum, - batch_count, - K, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +def grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). b (torch.Tensor): Input tensor of shape (G, K, N). 
""" - accum_dtype = "float32" + accum_dtype = T.float32 @T.prim_func def kernel( - A: T.Tensor([batch_sum, K], dtype), # type: ignore - B: T.Tensor([batch_count, K, N], dtype), # type: ignore - C: T.Tensor([batch_sum, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore - batch_padded_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, K], dtype), # type: ignore + B: T.Tensor([batch_count, K, N], dtype), # type: ignore + C: T.Tensor([batch_sum, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore + batch_padded_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - - with T.Kernel( - T.ceildiv(batch_sum, block_M) + batch_count, T.ceildiv(N, block_N), - threads=threads) as (bx, by): + with T.Kernel(T.ceildiv(batch_sum, block_M) + batch_count, T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared([block_M, block_K], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) - cur_batch_idx = T.alloc_local([1], "int32") - cur_batch_size = T.alloc_local([1], "int32") + cur_batch_idx = T.alloc_var(dtype=T.int32) + cur_batch_size = T.alloc_var(dtype=T.int32) m_start_padded = bx * block_M for i in range(batch_count): - in_cur_batch_idx = (m_start_padded >= batch_padded_offsets[i]) - cur_batch_idx[0] = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx[0]) + in_cur_batch_idx = m_start_padded >= batch_padded_offsets[i] + cur_batch_idx = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx) - cur_batch_size[0] = batch_sizes[cur_batch_idx[0]] - m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[ - cur_batch_idx[0]] - actual_rows = T.max( - 0, - T.min(block_M, - cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) + cur_batch_size = batch_sizes[cur_batch_idx] + m_start = m_start_padded - batch_padded_offsets[cur_batch_idx] + batch_offsets[cur_batch_idx] + actual_rows = T.max(0, T.min(block_M, cur_batch_size + batch_padded_offsets[cur_batch_idx] - m_start_padded)) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[m_start:m_start + block_M, k * block_K:(k + 1) * block_K], A_shared) - T.copy( - B[cur_batch_idx[0], k * block_K:(k + 1) * block_K, - by * block_N:(by + 1) * block_N], B_shared) + T.copy(A[m_start : m_start + block_M, k * block_K : (k + 1) * block_K], A_shared) + T.copy(B[cur_batch_idx, k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared) T.gemm(A_shared, B_shared, C_local) for i, j in T.Parallel(block_M, block_N): - with T.If(i < actual_rows), T.Then(): + if i < actual_rows: C[m_start + i, by * block_N + j] = C_local[i, j] return kernel class _GroupedGEMM(torch.autograd.Function): - @staticmethod def forward(ctx, a, b, batch_sizes): block_M = 64 @@ -99,15 +76,11 @@ def forward(ctx, a, b, batch_sizes): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes[i] + 1) / padding_M) * - padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes[i] + 1) / padding_M) * padding_M) batch_offsets = torch.tensor(batch_offsets_list, device=a.device, 
dtype=torch.int32) - batch_padded_offsets = torch.tensor( - batch_padded_offsets_list, device=a.device, dtype=torch.int32) + batch_padded_offsets = torch.tensor(batch_padded_offsets_list, device=a.device, dtype=torch.int32) - kernel = grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, - num_stages, threads) + kernel = grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages, threads) o = kernel(a, b, batch_sizes, batch_offsets, batch_padded_offsets) ctx.save_for_backward(a, b, batch_sizes, batch_offsets) @@ -135,8 +108,7 @@ def maybe_contiguous(x): return x A, B, batch_sizes = [maybe_contiguous(x) for x in (A, B, batch_sizes)] - kernel = grouped_gemm_bwd(ctx.batch_sum, ctx.batch_count, M, N, block_M, block_N, block_K, - num_stages, threads) + kernel = grouped_gemm_bwd(ctx.batch_sum, ctx.batch_count, M, N, block_M, block_N, block_K, num_stages, threads) dB = kernel(A, grad_output, batch_sizes, batch_offsets) return None, dB, None @@ -172,9 +144,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes_list[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes_list[i] + 1) / padding_M) * - padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes_list[i] + 1) / padding_M) * padding_M) A = torch.randn(batch_sum, K, device=device, dtype=dtype) B = torch.randn(batch_count, K, M, device=device, dtype=dtype) C = torch.empty(batch_sum, M, device=device, dtype=dtype) @@ -187,40 +157,24 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): return A, B, C, batch_sizes, batch_offsets, batch_padded_offsets -@tilelang.jit( - out_idx=[2], pass_configs={ - "tl.disable_tma_lower": True, - "tl.disable_warp_specialized": True - }) -def grouped_gemm_bwd(batch_sum, - batch_count, - M, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +def grouped_gemm_bwd(batch_sum, batch_count, M, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). b (torch.Tensor): Input tensor of shape (G, K, N). 
""" - accum_dtype = "float32" + accum_dtype = T.float32 @T.prim_func def kernel( - A: T.Tensor([batch_sum, M], dtype), # type: ignore - B: T.Tensor([batch_sum, N], dtype), # type: ignore - C: T.Tensor([batch_count, M, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, M], dtype), # type: ignore + B: T.Tensor([batch_sum, N], dtype), # type: ignore + C: T.Tensor([batch_count, M, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - - with T.Kernel( - T.ceildiv(M, block_M), T.ceildiv(N, block_N), batch_count, - threads=threads) as (bx, by, bz): + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), batch_count, threads=threads) as (bx, by, bz): A_shared = T.alloc_shared([block_K, block_M], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) @@ -228,13 +182,9 @@ def kernel( T.clear(C_local) for k in T.Pipelined(T.ceildiv(batch_sizes[bz], block_K), num_stages=num_stages): for i, j in T.Parallel(block_K, block_M): - A_shared[i, j] = T.if_then_else( - i < batch_sizes[bz], A[batch_offsets[bz] + k * block_K + i, - bx * block_M + j], 0) + A_shared[i, j] = T.if_then_else(i < batch_sizes[bz], A[batch_offsets[bz] + k * block_K + i, bx * block_M + j], 0) for i, j in T.Parallel(block_K, block_N): - B_shared[i, j] = T.if_then_else( - i < batch_sizes[bz], B[batch_offsets[bz] + k * block_K + i, - by * block_N + j], 0) + B_shared[i, j] = T.if_then_else(i < batch_sizes[bz], B[batch_offsets[bz] + k * block_K + i, by * block_N + j], 0) T.gemm(A_shared, B_shared, C_local, transpose_A=True) T.copy(C_local, C[bz, bx * block_M, by * block_N]) @@ -242,23 +192,12 @@ def kernel( return kernel -def run_tilelang_grouped_gemm(batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages=2, - threads=128, - profile=False): - +def run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages=2, threads=128, profile=False): padding_M = block_M device = torch.device("cuda") dtype = torch.float16 - A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs( - batch_sizes_list, K, M, False, padding_M, device, dtype) + A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs(batch_sizes_list, K, M, False, padding_M, device, dtype) A.requires_grad_(False) B.requires_grad_(True) @@ -273,10 +212,7 @@ def run_tilelang_grouped_gemm(batch_sizes_list, O.backward(dO, retain_graph=True) dB, B.grad = B.grad.clone(), None - if ( - torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) and \ - torch.allclose(dB, dB_ref, rtol=1e-2, atol=1e-2) - ): + if torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) and torch.allclose(dB, dB_ref, rtol=1e-2, atol=1e-2): print("✅ Tilelang and Torch match") else: print("❌ Tilelang and Torch mismatch") @@ -284,12 +220,11 @@ def run_tilelang_grouped_gemm(batch_sizes_list, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_sizes', type=str, default="64, 128", help='comma-separated batch sizes') - parser.add_argument('--K', type=int, default=8192, help='reduce dim') - parser.add_argument('--M', type=int, default=8192, help='output dim') - parser.add_argument('--trans_b', action="store_true", help="transpose B") - parser.add_argument('--profile', action="store_true", 
help="profile") + parser.add_argument("--batch_sizes", type=str, default="64, 128", help="comma-separated batch sizes") + parser.add_argument("--K", type=int, default=8192, help="reduce dim") + parser.add_argument("--M", type=int, default=8192, help="output dim") + parser.add_argument("--trans_b", action="store_true", help="transpose B") + parser.add_argument("--profile", action="store_true", help="profile") args = parser.parse_args() batch_sizes_list = [int(x) for x in args.batch_sizes.split(",")] @@ -301,14 +236,4 @@ def run_tilelang_grouped_gemm(batch_sizes_list, num_stages = 2 threads = 256 - run_tilelang_grouped_gemm( - batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages, - threads, - profile=args.profile) + run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages, threads, profile=args.profile) diff --git a/examples/grouped_gemm/example_grouped_gemm_fwd.py b/examples/grouped_gemm/example_grouped_gemm_fwd.py index 9b58e3a21..b71472741 100644 --- a/examples/grouped_gemm/example_grouped_gemm_fwd.py +++ b/examples/grouped_gemm/example_grouped_gemm_fwd.py @@ -18,8 +18,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): torch.Tensor: Resulting tensor after grouped matrix multiplication. """ assert a.shape[0] == sum(batch_sizes), "Sum of batch_sizes must equal the first dimension of a" - assert b.shape[0] == len( - batch_sizes), "The first dimension of b must match the length of batch_sizes" + assert b.shape[0] == len(batch_sizes), "The first dimension of b must match the length of batch_sizes" # Initialize output tensor output = torch.empty((sum(batch_sizes), b.shape[2]), device=a.device, dtype=a.dtype) @@ -38,15 +37,7 @@ def torch_gmm(a, b, batch_sizes, batch_offsets_tensor, trans_b=False): @tilelang.jit(out_idx=[2]) -def grouped_gemm(batch_sizes_list, - K, - N, - block_M, - block_N, - block_K, - num_stages=2, - threads=128, - dtype="float16"): +def grouped_gemm(batch_sizes_list, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): """ args: a (torch.Tensor): Input tensor of shape (M, K). 
@@ -54,50 +45,43 @@ def grouped_gemm(batch_sizes_list, """ batch_sum = sum(batch_sizes_list) batch_count = len(batch_sizes_list) - accum_dtype = "float32" + accum_dtype = T.float32 total_m_blocks = sum((size + block_M - 1) // block_M for size in batch_sizes_list) @T.prim_func def kernel( - A: T.Tensor([batch_sum, K], dtype), # type: ignore - B: T.Tensor([batch_count, K, N], dtype), # type: ignore - C: T.Tensor([batch_sum, N], dtype), # type: ignore - batch_sizes: T.Tensor([batch_count], "int32"), # type: ignore - batch_offsets: T.Tensor([batch_count], "int32"), # type: ignore - batch_padded_offsets: T.Tensor([batch_count], "int32"), # type: ignore + A: T.Tensor([batch_sum, K], dtype), # type: ignore + B: T.Tensor([batch_count, K, N], dtype), # type: ignore + C: T.Tensor([batch_sum, N], dtype), # type: ignore + batch_sizes: T.Tensor([batch_count], T.int32), # type: ignore + batch_offsets: T.Tensor([batch_count], T.int32), # type: ignore + batch_padded_offsets: T.Tensor([batch_count], T.int32), # type: ignore ): - with T.Kernel(total_m_blocks, T.ceildiv(N, block_N), threads=threads) as (bx, by): A_shared = T.alloc_shared([block_M, block_K], dtype) B_shared = T.alloc_shared([block_K, block_N], dtype) C_local = T.alloc_fragment([block_M, block_N], accum_dtype) - cur_batch_idx = T.alloc_local([1], "int32") - cur_batch_size = T.alloc_local([1], "int32") + cur_batch_idx = T.alloc_var(dtype=T.int32) + cur_batch_size = T.alloc_var(dtype=T.int32) m_start_padded = bx * block_M for i in range(batch_count): - in_cur_batch_idx = (m_start_padded >= batch_padded_offsets[i]) - cur_batch_idx[0] = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx[0]) + in_cur_batch_idx = m_start_padded >= batch_padded_offsets[i] + cur_batch_idx = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx) - cur_batch_size[0] = batch_sizes[cur_batch_idx[0]] - m_start = m_start_padded - batch_padded_offsets[cur_batch_idx[0]] + batch_offsets[ - cur_batch_idx[0]] - actual_rows = T.max( - 0, - T.min(block_M, - cur_batch_size[0] + batch_padded_offsets[cur_batch_idx[0]] - m_start_padded)) + cur_batch_size = batch_sizes[cur_batch_idx] + m_start = m_start_padded - batch_padded_offsets[cur_batch_idx] + batch_offsets[cur_batch_idx] + actual_rows = T.max(0, T.min(block_M, cur_batch_size + batch_padded_offsets[cur_batch_idx] - m_start_padded)) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[m_start:m_start + block_M, k * block_K:(k + 1) * block_K], A_shared) - T.copy( - B[cur_batch_idx[0], k * block_K:(k + 1) * block_K, - by * block_N:(by + 1) * block_N], B_shared) + T.copy(A[m_start : m_start + block_M, k * block_K : (k + 1) * block_K], A_shared) + T.copy(B[cur_batch_idx, k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared) T.gemm(A_shared, B_shared, C_local) for i, j in T.Parallel(block_M, block_N): - with T.If(i < actual_rows), T.Then(): + if i < actual_rows: C[m_start + i, by * block_N + j] = C_local[i, j] return kernel @@ -111,8 +95,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): for i in range(batch_count - 1): batch_offsets_list.append(batch_offsets_list[-1] + batch_sizes_list[i]) for i in range(batch_count - 1): - batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + - math.ceil((batch_sizes_list[i]) / padding_M) * padding_M) + batch_padded_offsets_list.append(batch_padded_offsets_list[-1] + math.ceil((batch_sizes_list[i]) / padding_M) * padding_M) A = torch.randn(batch_sum, K, device=device, dtype=dtype) B = 
torch.randn(batch_count, K, M, device=device, dtype=dtype) C = torch.empty(batch_sum, M, device=device, dtype=dtype) @@ -125,27 +108,16 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype): return A, B, C, batch_sizes, batch_offsets, batch_padded_offsets -def run_tilelang_grouped_gemm(batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages=2, - threads=128, - profile=False): +def run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages=2, threads=128, profile=False): padding_M = block_M batch_sum = sum(batch_sizes_list) - kernel = grouped_gemm( - tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) + kernel = grouped_gemm(tuple(batch_sizes_list), K, M, block_M, block_N, block_K, num_stages, threads) # print(kernel.get_kernel_source()) device = torch.device("cuda") dtype = torch.float16 - A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs( - batch_sizes_list, K, M, trans_b, padding_M, device, dtype) + A, B, C, batch_sizes, batch_offsets, batch_padded_offsets = construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype) out = kernel(A, B, batch_sizes, batch_offsets, batch_padded_offsets) ref_output = torch_gmm(A, B, batch_sizes, batch_offsets, trans_b) # print(out) @@ -157,8 +129,7 @@ def run_tilelang_grouped_gemm(batch_sizes_list, if profile: profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) - latency = profiler.do_bench( - warmup=500, input_tensors=[A, B, batch_sizes, batch_offsets, batch_padded_offsets]) + latency = profiler.do_bench(warmup=500, input_tensors=[A, B, batch_sizes, batch_offsets, batch_padded_offsets]) print(f"Latency: {latency} ms") print(f"TFlops: {batch_sum * K * M * 2 / latency * 1e-9} TFlops") @@ -173,12 +144,11 @@ def test_grouped_gemm(): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--batch_sizes', type=str, default="64, 128", help='comma-separated batch sizes') - parser.add_argument('--K', type=int, default=8192, help='reduce dim') - parser.add_argument('--M', type=int, default=8192, help='output dim') - parser.add_argument('--trans_b', action="store_true", help="transpose B") - parser.add_argument('--profile', action="store_true", help="profile") + parser.add_argument("--batch_sizes", type=str, default="64, 128", help="comma-separated batch sizes") + parser.add_argument("--K", type=int, default=8192, help="reduce dim") + parser.add_argument("--M", type=int, default=8192, help="output dim") + parser.add_argument("--trans_b", action="store_true", help="transpose B") + parser.add_argument("--profile", action="store_true", help="profile") args = parser.parse_args() batch_sizes_list = [int(x) for x in args.batch_sizes.split(",")] @@ -190,14 +160,4 @@ def test_grouped_gemm(): num_stages = 2 threads = 256 - run_tilelang_grouped_gemm( - batch_sizes_list, - K, - M, - block_M, - block_N, - block_K, - trans_b, - num_stages, - threads, - profile=args.profile) + run_tilelang_grouped_gemm(batch_sizes_list, K, M, block_M, block_N, block_K, trans_b, num_stages, threads, profile=args.profile) diff --git a/examples/hadamard_transform/example_hadamard.py b/examples/hadamard_transform/example_hadamard.py index 531d46891..65f463b71 100644 --- a/examples/hadamard_transform/example_hadamard.py +++ b/examples/hadamard_transform/example_hadamard.py @@ -17,7 +17,7 @@ def is_pow_of_2(n): def hadamard(b, n, dtype): assert is_pow_of_2(n), "n must be a power 
of 2" assert 2 <= n <= 32768, "n must be in [2, 32768]" - elem_size = {'float32': 4, 'float16': 2, 'bfloat16': 2}[dtype] + elem_size = {T.float32: 4, T.float16: 2, T.bfloat16: 2}[dtype] logN = int(math.log2(n)) threads = [0, 1, 1, 1, 2, 4, 8, 16, 32, 32, 128, 256, 256, 256, 256, 256][logN] @@ -40,23 +40,21 @@ def hadamard(b, n, dtype): # print(f'{exchange_round=}') @T.macro - def warp_shfl(local: T.Tensor((thread_elem,), dtype), buf: T.Tensor((thread_elem,), dtype), - round: int): + def warp_shfl(local: T.Tensor((thread_elem,), dtype), buf: T.Tensor((thread_elem,), dtype), round: int): tx = T.get_thread_binding(0) for i in T.serial(round): tx_stride = 1 << i another_tx = tx ^ tx_stride - sign = ( - tx >> i - ) & 1 # get i-th lowest bit of tx, which determines the operation type for shared[tx, :] + sign = (tx >> i) & 1 # get i-th lowest bit of tx, which determines the operation type for shared[tx, :] for j in T.Pipelined(thread_elem, num_stages=1): buf[j] = T.tvm_warp_shuffle( - 0xffffffff, # mask of all threads + 0xFFFFFFFF, # mask of all threads local[j], another_tx % warp_size, warp_size, - warp_size) + warp_size, + ) local[j] = T.if_then_else(sign == 0, local[j] + buf[j], buf[j] - local[j]) @T.prim_func @@ -78,10 +76,8 @@ def main(A: T.Tensor((b, n), dtype), B: T.Tensor((b, n), dtype)): for j in T.serial(chunknum): chunkbase = j * chunksize for k in T.serial(chunksize // 2): - local[chunkbase + - k] = local[chunkbase + k] + local[chunkbase + k + chunksize // 2] - local[chunkbase + k + chunksize // - 2] = local[chunkbase + k] - 2 * local[chunkbase + k + chunksize // 2] + local[chunkbase + k] = local[chunkbase + k] + local[chunkbase + k + chunksize // 2] + local[chunkbase + k + chunksize // 2] = local[chunkbase + k] - 2 * local[chunkbase + k + chunksize // 2] # 3. 
Hadamard inside warp, n<=512 # In warp level, we rely on warp shuffle to exchange data inside each warp, without using shared memory @@ -131,28 +127,27 @@ def ref_program(x: torch.Tensor): assert x.ndim == 2 dim = x.shape[-1] assert is_pow_of_2(dim) - return F.linear( - x, torch.tensor(scipy.linalg.hadamard(dim, dtype=float), dtype=x.dtype, device=x.device)) + return F.linear(x, torch.tensor(scipy.linalg.hadamard(dim, dtype=float), dtype=x.dtype, device=x.device)) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=64, help='Batch size') - parser.add_argument('--dim', type=int, default=32768, help='Dimension') + parser.add_argument("--batch", type=int, default=64, help="Batch size") + parser.add_argument("--dim", type=int, default=32768, help="Dimension") args = parser.parse_args() B, D = args.batch, args.dim - x = torch.randn((B, D), device='cuda') - kernel = hadamard(B, D, 'float32') + x = torch.randn((B, D), device="cuda") + kernel = hadamard(B, D, T.float32) y = kernel(x) y_ref = ref_program(x) torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2) - print('All tests passed.') + print("All tests passed.") profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) latency = profiler.do_bench(warmup=100) print("Tile-lang: {:.2f} ms".format(latency)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/lazy_jit/lazyjit.en.ipynb b/examples/lazy_jit/lazyjit.en.ipynb new file mode 100644 index 000000000..5b5df8e6a --- /dev/null +++ b/examples/lazy_jit/lazyjit.en.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Lazy JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Lazy JIT merges JIT kernel generation and invocation into a single workflow.\n", + "\n", + "The function signature looks similar to Triton, but we add many enhancements; the most important one is allowing rich Tensor annotations:\n", + "\n", + "* If a Tensor has complex shape constraints, we can move its annotation into the function body.\n", + "* Use `T.const` or `T.dynamic` to create shape variables, then annotate complex Tensors with `T.Tensor`.\n", + "* Use `T.empty` to declare return tensors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm(\n", + " A,\n", + " B,\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, N, K = T.const(\"M, N, K\")\n", + "\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + "\n", + " C = T.empty((M, N), out_dtype)\n", + "\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "Calling the function with Tensors directly triggers the full JIT compile-and-run pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "Changing the call arguments may trigger a recompilation when compilation parameters change:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "You can also explicitly call the `compile` method to build the kernel.\n", + "\n", + "1. `ker.compile` compiles the kernel\n", + "2. `ker.get_tir` retrieves the TIR\n", + "3. `ker.par_compile` compiles in parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### Use macros to separate implementation" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "Next, we implement a simple GEMM in several different ways. 
For convenience, we first write a macro that contains the core GEMM logic:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### Use `T.dynamic` to mark dynamic shapes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_dyn_K(A, B):\n", + " M, N, K = T.dynamic(\"M, N, K\")\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### Use `T.StridedTensor` to annotate tensors with strides\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def as_contingious(A):\n", + " M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n", + " A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A.T)\n", + "B_ref = A.T.contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### Use parameters directly as annotations" + ] + }, + { + "cell_type": "markdown", + "id": "e9a47d42", + "metadata": {}, + "source": [ + "You can directly use function parameters in the annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr(\n", + " A,\n", + " B,\n", + " M,\n", + " N,\n", + " K,\n", + "):\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### Annotations for runtime variables" + ] + }, + { + "cell_type": "markdown", + "id": "bba5f27f", + "metadata": {}, + "source": [ + "Runtime variables work the same; if the function annotation becomes too long, you can move it into the function body." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr_dyn(A, B, M, N, K):\n", + " M: T.int32\n", + " N: T.int32\n", + " K: T.int32\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "81427765", + "metadata": {}, + "source": [ + "### Constraints for constants" + ] + }, + { + "cell_type": "markdown", + "id": "4d6b084b", + "metadata": {}, + "source": [ + "A constant annotation created by `T.const` must be used directly at least once, otherwise an error is raised." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c90dd24f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constexpr variable `M` is not used in any buffer shape or stride.\n", + "At least one **DIRECT** usage is required. Please check:\n", + "(1) the variable is not used\n", + "(2) all uses are indirect, e.g. M * 2, M * 3. 
(you can replace them with separate constexpr variables)\n", + "Buffer shapes: {A: [M * 2, M * 3]}\n", + "Buffer strides: {A: [M * 3, 1]}\n" + ] + } + ], + "source": [ + "@tilelang.lazy_jit\n", + "def example_wrong_kernel(A):\n", + " M = T.const(\"M\")\n", + " A: T.Tensor[[M * 2, M * 3], T.float32]\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + "\n", + "\n", + "try:\n", + " A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + " example_wrong_kernel(A)\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "e07e762b", + "metadata": {}, + "source": [ + "### Dynamic dimensions" + ] + }, + { + "cell_type": "markdown", + "id": "f48e5d7a", + "metadata": {}, + "source": [ + "If you want certain parameters in a Tensor annotation to change, it is recommended to switch to the `T.ptr` + `T.match_buffer` style." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d050321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@tilelang.lazy_jit\n", + "def dyn_annot(\n", + " A: T.ptr, # 1. T.ptr type annotation\n", + " is_2d=False,\n", + "):\n", + " if is_2d:\n", + " M, N = T.const(\"M, N\")\n", + " # 2. dynamic shape annotation inside function body\n", + " A = T.match_buffer(A, [M, N], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + " else:\n", + " L = T.const(\"L\")\n", + " A = T.match_buffer(A, [L], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0]\n", + "\n", + "\n", + "A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + "dyn_annot(A, is_2d=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9f1bb3", + "metadata": {}, + "source": [ + "### Default arguments" + ] + }, + { + "cell_type": "markdown", + "id": "f7fc9917", + "metadata": {}, + "source": [ + "Scalar annotations like `T.float32` can carry default values." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42ec86a1", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def add_one(X, data: T.float32 = 1):\n", + " M, N = T.const(\"M, N\")\n", + " X: T.Tensor[[M, N], T.float32]\n", + " Y = T.empty((M, N), T.float32)\n", + " with T.Kernel(T.ceildiv(M, 128), threads=128) as bx:\n", + " for i, j in T.Parallel(128, N):\n", + " Y[bx * 128 + i, j] = X[bx * 128 + i, j] + data\n", + " return Y" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d49e1120", + "metadata": {}, + "outputs": [], + "source": [ + "X = torch.randn(1024, 1024, dtype=torch.float32, device=\"cuda\")\n", + "Y = add_one(X)\n", + "torch.testing.assert_close(Y, X + 1)" + ] + }, + { + "cell_type": "markdown", + "id": "a02baedc", + "metadata": {}, + "source": [ + "## Overhead of argument matching" + ] + }, + { + "cell_type": "markdown", + "id": "860a2972", + "metadata": {}, + "source": [ + "LazyJIT has very small overhead; each additional constant annotation costs about 200 ns.\n", + "* 200 ns is roughly the cost of an FFI call that reads parameters from a `torch.Tensor`'s shape/stride." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc676e33", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel call : 7.68 us\n", + "Parse cache key: 0.41 us\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "A = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def dummy_kernel(A, B):\n", + " M, N = T.const(\"M, N\")\n", + " A: T.Tensor[[M, N], T.float16]\n", + " B: T.Tensor[[M, N], T.float16]\n", + " with T.Kernel(1) as _:\n", + " pass\n", + "\n", + "\n", + "# compile it first\n", + "dummy_kernel(A, B)\n", + "\n", + "\n", + "def eval_overhead(f):\n", + " start = time.perf_counter_ns()\n", + " for _ in range(10000):\n", + " f()\n", + " stop = time.perf_counter_ns()\n", + " return (stop - start) / 10000 / 1000\n", + "\n", + "\n", + "kernel_call_overhead = eval_overhead(lambda: dummy_kernel(A, B))\n", + "parse_cache_key_overhead = eval_overhead(lambda: dummy_kernel.parse_cache_key(A, B))\n", + "\n", + "print(f\"Kernel call : {kernel_call_overhead:.2f} us\")\n", + "print(f\"Parse cache key: {parse_cache_key_overhead:.2f} us\")" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## Compilation and parallel compilation" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "Both `lazyjit` and the original `jit` support parallel compilation.\n", + "\n", + "To avoid wasting memory on temporary `torch.Tensor` objects, you can use `T.Tensor` to create placeholders." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7222e57b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4e4eb3cd4445bda6e8693da31ef3b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## More convenient macros" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang's macros have been improved:\n", + "\n", + "1. Allow using `T.Ref` as an annotation, similar to C++ references.\n", + "2. Allow returning multiple values.\n", + "3. Allow nesting and recursion." + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### Passing references with `T.Ref`\n", + "\n", + "A `T.Ref` reference can point to a scalar variable or to an element of a buffer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # Supports constant indices\n", + " macro_with_ref(x[1])\n", + "\n", + " # Also supports variable indices\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### Pass macros as arguments\n", + "\n", + "You can pass a macro as a function argument." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def element_wise(A, fn):\n", + " N = T.dynamic(\"N\")\n", + " A: T.Tensor[[N], T.float32]\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Recursive macros\n", + "\n", + "You may not need this often, but macros can be recursive as long as the termination condition is known at compile time." 
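Because a macro call is expanded inline while the kernel is traced, the recursion has to bottom out on values known at trace time (Python values or constants); a condition that depends on runtime tensor data cannot terminate it. A minimal sketch of such compile-time unrolling; `fill_pow2` and `powers` are hypothetical helpers in the same style as the `n31` example in the next cell:

```python
# Hedged sketch: recursion over a Python int is fully unrolled during tracing.
@T.macro
def fill_pow2(buf, n):
    if n > 0:  # `n` is a Python value, so this branch is resolved at trace time
        buf[n - 1] = 2 ** (n - 1)
        fill_pow2(buf, n - 1)


@tilelang.lazy_jit
def powers(A: T.Tensor[[8], T.int32], n: int):
    with T.Kernel(1) as _:
        fill_pow2(A, n)


A = torch.zeros(8, dtype=torch.int32, device="cuda")
powers(A, 8)  # A becomes [1, 2, 4, ..., 128]
```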
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7703cab5", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def n31(x, var: T.Ref):\n", + " if x == 1:\n", + " pass\n", + " elif x % 2 == 0:\n", + " var = var // 2\n", + " n31(x // 2, var)\n", + " else:\n", + " var = var * 3 + 1\n", + " n31(x * 3 + 1, var)\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def foo(A: T.Tensor[[1], T.int32], n: int):\n", + " with T.Kernel(1) as _:\n", + " n31(n, A[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "542ddd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([18], device='cuda:0', dtype=torch.int32)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macros returning multiple values" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd83fea7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/lazy_jit/lazyjit.zh.ipynb b/examples/lazy_jit/lazyjit.zh.ipynb new file mode 100644 index 000000000..387aff461 --- /dev/null +++ b/examples/lazy_jit/lazyjit.zh.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Lazy JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Lazy JIT 将 jit 
生成和调用的逻辑合并到一起\n", + "\n", + "函数签名的写法与 triton 相似,但做了大量增强,最主要的增强是允许对 Tensor 的标注:\n", + "\n", + "* 如果一个 Tensor 有复杂的 shape 约束,我们可以把它的标注移动到函数内部\n", + "* 通过 `T.const` 或 `T.dynamic` 来建立一些 shape 变量,然后用 `T.Tensor` 标注复杂的 Tensor\n", + "* 用 `T.empty` 来声明返回值" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm(\n", + " A,\n", + " B,\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, N, K = T.const(\"M, N, K\")\n", + "\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + "\n", + " C = T.empty((M, N), out_dtype)\n", + "\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "直接将 Tensor 作为参数调用,即可触发完整的 jit 编译运行流程:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "更改调用的参数,如果编译器参数发生了变化,会触发重新编译:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "你也可以手动调用 compile 函数编译 kernel\n", + "\n", + "1. `ker.compile` 编译 kernel\n", + "2. `ker.get_tir` 获取 tir\n", + "3. 
`ker.par_compile` 并行编译" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### 用 macro 来分离实现" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "接下来,我们会用各种方式来实现一个简单的 gemm,为了方便,我们先写一个 macro 把 gemm 的主要逻辑写出来:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### 用 T.dynamic 标记动态 Shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_dyn_K(A, B):\n", + " M, N, K = T.dynamic(\"M, N, K\")\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### 用 T.StridedTensor 标记带 stride 的 Tensor\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def as_contingious(A):\n", + " M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n", + " A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A.T)\n", + "B_ref = A.T.contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" 
+ ] + }, + { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### 直接用参数当 annotation" + ] + }, + { + "cell_type": "markdown", + "id": "e9a47d42", + "metadata": {}, + "source": [ + "可以直接把函数参数写到 annotation 里面" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr(\n", + " A,\n", + " B,\n", + " M,\n", + " N,\n", + " K,\n", + "):\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### 对运行时变量的 annotation" + ] + }, + { + "cell_type": "markdown", + "id": "bba5f27f", + "metadata": {}, + "source": [ + "运行时变量也是一样,如果嫌函数 annotation 太长,可以放到函数体里面" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def gemm_ptr_dyn(A, B, M, N, K):\n", + " M: T.int32\n", + " N: T.int32\n", + " K: T.int32\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "81427765", + "metadata": {}, + "source": [ + "### 常量的约束" + ] + }, + { + "cell_type": "markdown", + "id": "4d6b084b", + "metadata": {}, + "source": [ + "`T.const` 创建的常量 annotation 只要要被直接使用一次,否则会报错" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c90dd24f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constexpr variable `M` is not used in any buffer shape or stride.\n", + "At least one **DIRECT** usage is required. Please check:\n", + "(1) the variable is not used\n", + "(2) all uses are indirect, e.g. M * 2, M * 3. 
(you can replace them with separate constexpr variables)\n", + "Buffer shapes: {A: [M * 2, M * 3]}\n", + "Buffer strides: {A: [M * 3, 1]}\n" + ] + } + ], + "source": [ + "@tilelang.lazy_jit\n", + "def example_wrong_kernel(A):\n", + " M = T.const(\"M\")\n", + " A: T.Tensor[[M * 2, M * 3], T.float32]\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + "\n", + "\n", + "try:\n", + " A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + " example_wrong_kernel(A)\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "e07e762b", + "metadata": {}, + "source": [ + "### 动态维度的" + ] + }, + { + "cell_type": "markdown", + "id": "f48e5d7a", + "metadata": {}, + "source": [ + "如果想要 Tensor 的 annotation 类型某个参数变化,建议改成 T.ptr + T.match_buffer 格式。" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d050321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@tilelang.lazy_jit\n", + "def dyn_annot(\n", + " A: T.ptr, # 1. T.ptr type annotation\n", + " is_2d=False,\n", + "):\n", + " if is_2d:\n", + " M, N = T.const(\"M, N\")\n", + " # 2. dynamic shape annotation inside function body\n", + " A = T.match_buffer(A, [M, N], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + " else:\n", + " L = T.const(\"L\")\n", + " A = T.match_buffer(A, [L], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0]\n", + "\n", + "\n", + "A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + "dyn_annot(A, is_2d=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9f1bb3", + "metadata": {}, + "source": [ + "### 带默认参数的" + ] + }, + { + "cell_type": "markdown", + "id": "f7fc9917", + "metadata": {}, + "source": [ + "类似 `T.float32` 标注的标量可以带默认参数" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42ec86a1", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def add_one(X, data: T.float32 = 1):\n", + " M, N = T.const(\"M, N\")\n", + " X: T.Tensor[[M, N], T.float32]\n", + " Y = T.empty((M, N), T.float32)\n", + " with T.Kernel(T.ceildiv(M, 128), threads=128) as bx:\n", + " for i, j in T.Parallel(128, N):\n", + " Y[bx * 128 + i, j] = X[bx * 128 + i, j] + data\n", + " return Y" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d49e1120", + "metadata": {}, + "outputs": [], + "source": [ + "X = torch.randn(1024, 1024, dtype=torch.float32, device=\"cuda\")\n", + "Y = add_one(X)\n", + "torch.testing.assert_close(Y, X + 1)" + ] + }, + { + "cell_type": "markdown", + "id": "a02baedc", + "metadata": {}, + "source": [ + "## 参数匹配的 Overhead" + ] + }, + { + "cell_type": "markdown", + "id": "860a2972", + "metadata": {}, + "source": [ + "LazyJIT overhead 很小,每个 constant 添加约 200ns 的 overhead\n", + "* 200ns 大约是从 torch.Tensor 的 shape/stride 中拿参数的 ffi call 的代价" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc676e33", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel call : 7.68 us\n", + "Parse cache key: 0.41 us\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "A = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def dummy_kernel(A, B):\n", + " M, N = T.const(\"M, N\")\n", + " A: T.Tensor[[M, N], T.float16]\n", + " B: T.Tensor[[M, N], 
T.float16]\n", + " with T.Kernel(1) as _:\n", + " pass\n", + "\n", + "\n", + "# compile it first\n", + "dummy_kernel(A, B)\n", + "\n", + "\n", + "def eval_overhead(f):\n", + " start = time.perf_counter_ns()\n", + " for _ in range(10000):\n", + " f()\n", + " stop = time.perf_counter_ns()\n", + " return (stop - start) / 10000 / 1000\n", + "\n", + "\n", + "kernel_call_overhead = eval_overhead(lambda: dummy_kernel(A, B))\n", + "parse_cache_key_overhead = eval_overhead(lambda: dummy_kernel.parse_cache_key(A, B))\n", + "\n", + "print(f\"Kernel call : {kernel_call_overhead:.2f} us\")\n", + "print(f\"Parse cache key: {parse_cache_key_overhead:.2f} us\")" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## 编译与并行编译" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "lazyjit 和原来的 jit 都支持并行编译\n", + "\n", + "为了防止 torch.tensor 白白浪费内存,可以使用 T.Tensor 来创建 placeholder" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7222e57b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4e4eb3cd4445bda6e8693da31ef3b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## 更便利的 Macro" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang 的 macro 现在已经升级:\n", + "\n", + "1. 允许用 `T.Ref` 作为 annotation,这类似与 C++ 的引用传递\n", + "2. 允许返回多个值\n", + "3. 
允许嵌套,递归" + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### T.Ref 传递引用\n", + "\n", + "T.Ref 传递的引用可以 var 也可以是 Buffer 的索引" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # 支持常量 index\n", + " macro_with_ref(x[1])\n", + "\n", + " # 也支持变量 index\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### 当作参数传递\n", + "\n", + "你可以把 macro 当做参数传递" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.lazy_jit\n", + "def element_wise(A, fn):\n", + " N = T.dynamic(\"N\")\n", + " A: T.Tensor[[N], T.float32]\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Macro 递归\n", + "\n", + "虽然不知道有没有这种需求,但 macro 是可以递归的,终止条件要求编译期间确定" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7703cab5", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def n31(x, var: T.Ref):\n", + " if x == 1:\n", + " pass\n", + " elif x % 2 == 0:\n", + " var = var // 2\n", + " n31(x // 2, var)\n", + " else:\n", + " var = var * 3 + 1\n", + " n31(x * 3 + 1, var)\n", + "\n", + "\n", + "@tilelang.lazy_jit\n", + "def foo(A: T.Tensor[[1], T.int32], n: int):\n", + " with T.Kernel(1) as _:\n", + " n31(n, A[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "542ddd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([18], device='cuda:0', dtype=torch.int32)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macro 返回多个值" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd83fea7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/linear_attention/example_linear_attn_bwd.py b/examples/linear_attention/example_linear_attn_bwd.py index 568bcc55f..82ae1d982 100644 --- a/examples/linear_attention/example_linear_attn_bwd.py +++ b/examples/linear_attention/example_linear_attn_bwd.py @@ -13,20 +13,20 @@ pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + } +) def tl_fused_chunk_bwd_kernel( B, S, H, DK, DV, - dtype: str = 'float16', + dtype: T.dtype = T.float16, scale: float = None, ) -> torch.Tensor: - if scale is None: scale = DK**-0.5 - accum_dtype = 'float' + accum_dtype = T.float32 chunk_size = 64 BK = BV = 64 # Set to 128 can be faster, but has some numerical differences with FLA @@ -37,13 +37,13 @@ def tl_fused_chunk_bwd_kernel( @T.prim_func def fused_chunk_linear_attn_bwd( - Q: T.Tensor([B, S, H, DK], dtype), # type: ignore - K: T.Tensor([B, S, H, DK], dtype), # type: ignore - V: T.Tensor([B, S, H, DV], dtype), # type: ignore - dO: T.Tensor([B, S, H, DV], dtype), # type: ignore - dQ: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore - dK: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore - dV: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore + Q: T.Tensor([B, S, H, DK], dtype), # type: ignore + K: T.Tensor([B, S, H, DK], dtype), # type: ignore + V: T.Tensor([B, S, H, DV], dtype), # type: ignore + dO: T.Tensor([B, S, H, DV], dtype), # type: ignore + dQ: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore + dK: T.Tensor([B, S, H, DK], accum_dtype), # type: ignore + dV: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore ): with T.Kernel(NV, NK, B * H) as (i_v, i_k, i_bh): i_b = i_bh // H @@ -66,11 
+66,6 @@ def fused_chunk_linear_attn_bwd( dh = T.alloc_fragment([BK, BV], accum_dtype) dh_shared = T.alloc_shared([BK, BV], dtype) - T.annotate_layout({ - dq_shared: tilelang.layout.make_swizzled_layout(dq_shared), - dk_shared: tilelang.layout.make_swizzled_layout(dk_shared), - dv_shared: tilelang.layout.make_swizzled_layout(dv_shared) - }) T.use_swizzle(10) T.clear(h) @@ -78,10 +73,9 @@ def fused_chunk_linear_attn_bwd( # Calculate dQ for i in T.Pipelined(0, NT): - T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k) - T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v) - T.copy(dO[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], - do) + T.copy(K[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) + T.copy(dO[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], do) T.gemm(do, v, ds, transpose_B=True, clear_accum=True) for row, col in T.Parallel(chunk_size, chunk_size): @@ -94,29 +88,19 @@ def fused_chunk_linear_attn_bwd( for row, col in T.Parallel(chunk_size, BK): dq[row, col] *= scale T.copy(dq, dq_shared) - T.atomic_add( - dQ[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], - dq_shared) + T.atomic_add(dQ[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], dq_shared) # Calculate dK, dV (reversely) for i in T.Pipelined(1, NT + 1): start = NT - i for row, col in T.Parallel(chunk_size, BK): q[row, col] = Q[i_b, start * chunk_size + row, i_h, i_k * BK + col] * scale - T.copy( - K[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_k * BK:(i_k + 1) * BK], k) - T.copy( - V[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV], v) - T.copy( - dO[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV], do) + T.copy(K[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) + T.copy(dO[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], do) # Calculate dk - T.gemm( - v, do, ds, transpose_B=True, clear_accum=True - ) # ds here actually means `s`, but we simply reuse the buffer `ds` + T.gemm(v, do, ds, transpose_B=True, clear_accum=True) # ds here actually means `s`, but we simply reuse the buffer `ds` for row, col in T.Parallel(chunk_size, chunk_size): ds_shared[row, col] = T.if_then_else(row <= col, ds[row, col], 0) T.gemm(ds_shared, q, dk, clear_accum=True) @@ -134,13 +118,9 @@ def fused_chunk_linear_attn_bwd( T.gemm(q, do, dh, transpose_A=True) T.copy(dk, dk_shared) - T.atomic_add( - dK[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_k * BK:(i_k + 1) * BK], dk_shared) + T.atomic_add(dK[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], dk_shared) T.copy(dv, dv_shared) - T.atomic_add( - dV[i_b, start * chunk_size:(start + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV], dv_shared) + T.atomic_add(dV[i_b, start * chunk_size : (start + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], dv_shared) return fused_chunk_linear_attn_bwd @@ -155,34 +135,31 @@ def tl_fused_chunk_bwd(Q, K, V, dO): return dQ.to(torch.float16), dK.to(torch.float16), dV.to(torch.float16) -def ref_program(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - scale: Optional[float] = 
None) -> Tuple[torch.Tensor, torch.Tensor]: +def ref_program(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: q, k, v = q.float(), k.float(), v.float() if scale is None: - scale = q.shape[-1]**-0.5 + scale = q.shape[-1] ** -0.5 chunk_size = 64 - q = rearrange(q, 'b (n c) h d -> b h n c d', c=chunk_size) * scale - k = rearrange(k, 'b (n c) h d -> b h n c d', c=chunk_size) - v = rearrange(v, 'b (n c) h d -> b h n c d', c=chunk_size) + q = rearrange(q, "b (n c) h d -> b h n c d", c=chunk_size) * scale + k = rearrange(k, "b (n c) h d -> b h n c d", c=chunk_size) + v = rearrange(v, "b (n c) h d -> b h n c d", c=chunk_size) kv = k.transpose(-1, -2) @ v kv = kv.cumsum(2) h = kv[:, :, -1, :, :] kv = torch.cat([torch.zeros_like(kv[:, :, :1]), kv[:, :, :-1]], dim=2) inter = q @ kv - intra = ((q @ k.transpose(-1, -2)).masked_fill_( - torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), - 0)) @ v + intra = ( + (q @ k.transpose(-1, -2)).masked_fill_(torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), 0) + ) @ v o = inter + intra - return rearrange(o, 'b h n c d -> b (n c) h d'), h + return rearrange(o, "b h n c d -> b (n c) h d"), h def main(B=1, S=1024, H=16, D=128): - q = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16, requires_grad=True) - k = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16, requires_grad=True) - v = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16, requires_grad=True) - do = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + do = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) # qk norm is necessary for linear attn q = l2norm_fwd(q)[0].requires_grad_(True) @@ -193,30 +170,42 @@ def main(B=1, S=1024, H=16, D=128): o_ref, _ = ref_program(q, k, v) o_ref.backward(do, retain_graph=True) - assert torch.allclose( - dq, q.grad, atol=1e-2, rtol=1e-2), f'dq max err: {(dq - q.grad).abs().max()}' - assert torch.allclose( - dk, k.grad, atol=1e-2, rtol=1e-2), f'dk max err: {(dk - k.grad).abs().max()}' - assert torch.allclose( - dv, v.grad, atol=1e-2, rtol=1e-2), f'dv max err: {(dv - v.grad).abs().max()}' - print('Passed all tests!✅') + assert torch.allclose(dq, q.grad, atol=1e-2, rtol=1e-2), f"dq max err: {(dq - q.grad).abs().max()}" + assert torch.allclose(dk, k.grad, atol=1e-2, rtol=1e-2), f"dk max err: {(dk - k.grad).abs().max()}" + assert torch.allclose(dv, v.grad, atol=1e-2, rtol=1e-2), f"dv max err: {(dv - v.grad).abs().max()}" + print("Passed all tests!✅") # Benchmark q.grad = k.grad = v.grad = None o_ref, _ = fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False) - t1 = do_bench(lambda: o_ref.backward(do, retain_graph=True), backend='cupti') - t2 = do_bench(lambda: tl_fused_chunk_bwd(q, k, v, do), backend='cupti') - print(f'Triton latency: {t1:.3f} ms') - print(f'TileLang latency: {t2:.3f} ms') - print(f'Speedup: {t1/t2:.3f}x') + t1 = do_bench(lambda: o_ref.backward(do, retain_graph=True), backend="cupti") + t2 = do_bench(lambda: tl_fused_chunk_bwd(q, k, v, do), backend="cupti") + print(f"Triton latency: {t1:.3f} ms") + print(f"TileLang latency: {t2:.3f} ms") + print(f"Speedup: {t1 / t2:.3f}x") + + +def 
run_regression_perf(B=1, S=1024, H=16, D=128): + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16, requires_grad=True) + do = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + q = l2norm_fwd(q)[0].requires_grad_(True) + k = l2norm_fwd(k)[0].requires_grad_(True) + kernel = tl_fused_chunk_bwd_kernel(B, S, H, D, D) + dQ = torch.zeros_like(q, dtype=torch.float32) + dK = torch.zeros_like(k, dtype=torch.float32) + dV = torch.zeros_like(v, dtype=torch.float32) + kernel(q, k, v, do, dQ, dK, dV) + return do_bench(lambda: kernel(q, k, v, do, dQ, dK, dV), backend="cupti") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--B', type=int, default=8, help='Batch size') - parser.add_argument('--S', type=int, default=1024, help='Seq len') - parser.add_argument('--H', type=int, default=32, help='Num heads') - parser.add_argument('--D', type=int, default=128, help='Head dim') + parser.add_argument("--B", type=int, default=8, help="Batch size") + parser.add_argument("--S", type=int, default=1024, help="Seq len") + parser.add_argument("--H", type=int, default=32, help="Num heads") + parser.add_argument("--D", type=int, default=128, help="Head dim") args = parser.parse_args() main(args.B, args.S, args.H, args.D) diff --git a/examples/linear_attention/example_linear_attn_fwd.py b/examples/linear_attention/example_linear_attn_fwd.py index cbf352bbc..cdfd5cb72 100644 --- a/examples/linear_attention/example_linear_attn_fwd.py +++ b/examples/linear_attention/example_linear_attn_fwd.py @@ -14,20 +14,20 @@ pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }) + }, +) def tl_fused_chunk_fwd_kernel( B, S, H, DK, DV, - dtype: str = 'float16', + dtype: T.dtype = T.float16, scale: float = None, ) -> torch.Tensor: - if scale is None: scale = DK**-0.5 - accum_dtype = 'float' + accum_dtype = T.float32 chunk_size = 64 BK = BV = 64 # Set to 128 can be faster, but has some numerical differences with FLA @@ -38,11 +38,12 @@ def tl_fused_chunk_fwd_kernel( @T.prim_func def fused_chunk_linear_attn_fwd( - Q: T.Tensor([B, S, H, DK], dtype), # type: ignore - K: T.Tensor([B, S, H, DK], dtype), # type: ignore - V: T.Tensor([B, S, H, DV], dtype), # type: ignore - O: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore - final_state: T.Tensor([B, H, DK, DV], accum_dtype)): # type: ignore + Q: T.Tensor([B, S, H, DK], dtype), # type: ignore + K: T.Tensor([B, S, H, DK], dtype), # type: ignore + V: T.Tensor([B, S, H, DV], dtype), # type: ignore + O: T.Tensor([B, S, H, DV], accum_dtype), # type: ignore + final_state: T.Tensor([B, H, DK, DV], accum_dtype), + ): # type: ignore with T.Kernel(NV, NK, B * H) as (i_v, i_k, i_bh): i_b = i_bh // H i_h = i_bh % H @@ -57,7 +58,6 @@ def fused_chunk_linear_attn_fwd( o = T.alloc_fragment([chunk_size, BV], accum_dtype) o_shared = T.alloc_shared([chunk_size, BV], accum_dtype) - T.annotate_layout({o_shared: tilelang.layout.make_swizzled_layout(o_shared)}) T.use_swizzle(10) T.clear(h) @@ -65,8 +65,8 @@ def fused_chunk_linear_attn_fwd( for i in T.Pipelined(0, NT): for row, col in T.Parallel(chunk_size, BK): q[row, col] = Q[i_b, i * chunk_size + row, i_h, i_k * BK + col] * scale - T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k) - 
T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v) + T.copy(K[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) T.gemm(q, k, s, clear_accum=True, transpose_B=True) for row, col in T.Parallel(chunk_size, chunk_size): @@ -77,13 +77,10 @@ def fused_chunk_linear_attn_fwd( T.gemm(k, v, h, transpose_A=True) T.gemm(q, h_shared, o) T.copy(o, o_shared) - T.atomic_add( - O[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], - o_shared) - #TODO: consider using vectorized atomic add or tma reduce for sm90 + T.atomic_add(O[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], o_shared) # Output final state - T.copy(h, final_state[i_b, i_h, i_k * BK:(i_k + 1) * BK, i_v * BV:(i_v + 1) * BV]) + T.copy(h, final_state[i_b, i_h, i_k * BK : (i_k + 1) * BK, i_v * BV : (i_v + 1) * BV]) return fused_chunk_linear_attn_fwd @@ -91,38 +88,36 @@ def fused_chunk_linear_attn_fwd( def tl_fused_chunk_fwd(q, k, v): B, S, H, D = q.shape kernel = tl_fused_chunk_fwd_kernel(B, S, H, D, D) - o = torch.zeros((B, S, H, D), device='cuda', dtype=torch.float32) + print(kernel.get_kernel_source()) + o = torch.zeros((B, S, H, D), device="cuda", dtype=torch.float32) h = kernel(q, k, v, o) return o, h -def ref_program(q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: +def ref_program(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, scale: Optional[float] = None) -> Tuple[torch.Tensor, torch.Tensor]: q, k, v = q.float(), k.float(), v.float() if scale is None: - scale = q.shape[-1]**-0.5 + scale = q.shape[-1] ** -0.5 chunk_size = 64 - q = rearrange(q, 'b (n c) h d -> b h n c d', c=chunk_size) * scale - k = rearrange(k, 'b (n c) h d -> b h n c d', c=chunk_size) - v = rearrange(v, 'b (n c) h d -> b h n c d', c=chunk_size) + q = rearrange(q, "b (n c) h d -> b h n c d", c=chunk_size) * scale + k = rearrange(k, "b (n c) h d -> b h n c d", c=chunk_size) + v = rearrange(v, "b (n c) h d -> b h n c d", c=chunk_size) kv = k.transpose(-1, -2) @ v kv = kv.cumsum(2) h = kv[:, :, -1, :, :] kv = torch.cat([torch.zeros_like(kv[:, :, :1]), kv[:, :, :-1]], dim=2) inter = q @ kv - intra = ((q @ k.transpose(-1, -2)).masked_fill_( - torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), - 0)) @ v + intra = ( + (q @ k.transpose(-1, -2)).masked_fill_(torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), 0) + ) @ v o = inter + intra - return rearrange(o, 'b h n c d -> b (n c) h d'), h + return rearrange(o, "b h n c d -> b (n c) h d"), h def main(B=1, S=512, H=16, D=128): - q = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - k = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - v = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) # qk norm is necessary for linear attn q, _ = l2norm_fwd(q) @@ -131,25 +126,35 @@ def main(B=1, S=512, H=16, D=128): o, h = tl_fused_chunk_fwd(q, k, v) o_ref, h_ref = ref_program(q, k, v) - assert torch.allclose(o, o_ref, atol=1e-2, rtol=1e-2), f'o max err: {(o - o_ref).abs().max()}' - assert torch.allclose(h, h_ref, atol=1e-2, rtol=1e-2), f'h max err: 
{(h - h_ref).abs().max()}' - print('Passed all tests!✅') + assert torch.allclose(o, o_ref, atol=1e-2, rtol=1e-2), f"o max err: {(o - o_ref).abs().max()}" + assert torch.allclose(h, h_ref, atol=1e-2, rtol=1e-2), f"h max err: {(h - h_ref).abs().max()}" + print("Passed all tests!✅") + + t1 = do_bench(lambda: fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False), backend="cupti") + t2 = do_bench(lambda: tl_fused_chunk_fwd(q, k, v), backend="cupti") + print(f"Triton latency: {t1:.3f} ms") + print(f"TileLang latency: {t2:.3f} ms") + print(f"Speedup: {t1 / t2:.3f}x") - t1 = do_bench( - lambda: fused_chunk_linear_attn(q, k, v, output_final_state=True, normalize=False), - backend='cupti') - t2 = do_bench(lambda: tl_fused_chunk_fwd(q, k, v), backend='cupti') - print(f'Triton latency: {t1:.3f} ms') - print(f'TileLang latency: {t2:.3f} ms') - print(f'Speedup: {t1/t2:.3f}x') + +def run_regression_perf(B=1, S=512, H=16, D=128): + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + q, _ = l2norm_fwd(q) + k, _ = l2norm_fwd(k) + B, S, H, D = q.shape + kernel = tl_fused_chunk_fwd_kernel(B, S, H, D, D) + o = torch.zeros((B, S, H, D), device="cuda", dtype=torch.float32) + return do_bench(lambda: kernel(q, k, v, o), backend="cupti") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--B', type=int, default=8, help='Batch size') - parser.add_argument('--S', type=int, default=1024, help='Seq len') - parser.add_argument('--H', type=int, default=32, help='Num heads') - parser.add_argument('--D', type=int, default=128, help='Head dim') + parser.add_argument("--B", type=int, default=8, help="Batch size") + parser.add_argument("--S", type=int, default=1024, help="Seq len") + parser.add_argument("--H", type=int, default=32, help="Num heads") + parser.add_argument("--D", type=int, default=128, help="Head dim") args = parser.parse_args() main(args.B, args.S, args.H, args.D) diff --git a/examples/linear_attention/example_mamba_chunk_scan.py b/examples/linear_attention/example_mamba_chunk_scan.py index add49052d..88a9b75bc 100644 --- a/examples/linear_attention/example_mamba_chunk_scan.py +++ b/examples/linear_attention/example_mamba_chunk_scan.py @@ -9,6 +9,7 @@ def chunk_scan_triton(cb, x, dt, dA_cumsum, C, states, D): from mamba_ssm.ops.triton.ssd_chunk_scan import _chunk_scan_fwd + out, _ = _chunk_scan_fwd(cb, x, dt, dA_cumsum, C, states, D) return out @@ -43,14 +44,15 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): dt_segment_sum = dA_cumsum[:, :, :, :, None] - dA_cumsum[:, :, :, None, :] decay = torch.exp(dt_segment_sum) scores_decay = cb * rearrange(decay, "b h c l s -> b c h l s") - causal_mask = torch.tril( - torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) + causal_mask = torch.tril(torch.ones(chunk_size, chunk_size, device=x.device, dtype=bool), diagonal=0) scores_decay = scores_decay.masked_fill(~causal_mask, 0) - out = torch.einsum('bchls,bhcs,bcshp->bclhp', scores_decay.to(x.dtype), dt.to(x.dtype), - rearrange(x, "b (c s) h p -> b c s h p", c=nchunks)) + out = torch.einsum( + "bchls,bhcs,bcshp->bclhp", scores_decay.to(x.dtype), dt.to(x.dtype), rearrange(x, "b (c s) h p -> b c s h p", c=nchunks) + ) state_decay_out = torch.exp(rearrange(dA_cumsum, "b h c l -> b c l h 1")) - out_prev = torch.einsum('bclhn,bchpn->bclhp', rearrange( - C, "b 
(c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + out_prev = ( + torch.einsum("bclhn,bchpn->bclhp", rearrange(C, "b (c l) h n -> b c l h n", c=nchunks), prev_states.to(C.dtype)) * state_decay_out + ) out = out + out_prev out = rearrange(out, "b c l h p -> b (c l) h p") if D is not None: @@ -61,12 +63,7 @@ def ref_program(cb, x, dt, dA_cumsum, C, prev_states, D): def get_configs(): - iter_params = dict( - block_M=[64, 128, 256], - block_N=[32, 64], - block_K=[64, 128, 256], - block_Dstate=[128], - num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128, 256], block_N=[32, 64], block_K=[64, 128, 256], block_Dstate=[128], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @@ -77,56 +74,58 @@ def get_configs(): tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, }, ) -def chunk_scan_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - block_Dstate=128, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_scan_fwd( + batch, + seqlen, + chunk_size, + ngroups, + nheads, + headdim, + dstate, + block_M=64, + block_N=64, + block_K=64, + block_Dstate=128, + num_stages=2, + threads=128, +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func def main( - cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore - x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore - dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore - C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore - prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore - D: T.Tensor((nheads), dtype), # type: ignore - Output: T.Tensor((batch, seqlen, nheads, headdim), dtype) # type: ignore + cb: T.Tensor((batch, nchunks, ngroups, chunk_size, chunk_size), dtype), # type: ignore + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), # type: ignore + C: T.Tensor((batch, seqlen, ngroups, dstate), dtype), # type: ignore + prev_states: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), # type: ignore + D: T.Tensor((nheads), dtype), # type: ignore + Output: T.Tensor((batch, seqlen, nheads, headdim), dtype), # type: ignore ): - with T.Kernel( - nheads, - T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + with T.Kernel(nheads, T.ceildiv(chunk_size, block_M) * T.ceildiv(headdim, block_N), batch * nchunks, threads=threads) as ( + bz, + bx, + by, + ): acc_o = T.alloc_fragment((block_M, block_N), accum_dtype) acc_o_shared = T.alloc_shared((block_M, block_N), dtype) - cb_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared.dyn") + cb_shared = T.alloc_shared((block_M, block_K), dtype) cb_local = T.alloc_fragment((block_M, block_K), dtype) - dA_cs_k_shared = T.alloc_shared((block_K), dtype, scope="shared") + dA_cs_k_shared = T.alloc_shared((block_K), dtype) dA_cs_k_local = T.alloc_fragment((block_K), accum_dtype) dA_cs_m_local = T.alloc_fragment((block_M), accum_dtype) - dt_shared = T.alloc_shared((block_K), dtype, scope="shared") + dt_shared = 
T.alloc_shared((block_K), dtype) dt_local = T.alloc_fragment((block_K), accum_dtype) x_shared = T.alloc_shared((block_K, block_N), dtype, scope="shared.dyn") - dA_cs_m_shared = T.alloc_shared((block_M), dtype, scope="shared") + dA_cs_m_shared = T.alloc_shared((block_M), dtype) scale_m_local = T.alloc_fragment((block_M), accum_dtype) C_shared = T.alloc_shared((block_M, block_Dstate), dtype) prev_state_shared = T.alloc_shared((block_N, block_Dstate), dtype) D_local = T.alloc_fragment((1), accum_dtype) - x_residual_shared = T.alloc_shared((block_M, block_N), dtype, scope="shared.dyn") + x_residual_shared = T.alloc_shared((block_M, block_N), dtype) x_residual_local = T.alloc_fragment((block_M, block_N), accum_dtype) batch_idx = by % batch @@ -136,27 +135,31 @@ def main( m_idx = bx // T.ceildiv(headdim, block_N) n_idx = bx % T.ceildiv(headdim, block_N) - T.annotate_layout({ - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared), - cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), - x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared) - }) + T.annotate_layout( + { + cb_shared: tilelang.layout.make_swizzled_layout(cb_shared), + x_residual_shared: tilelang.layout.make_swizzled_layout(x_residual_shared), + } + ) T.no_set_max_nreg() - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M:(m_idx + 1) * block_M], - dA_cs_m_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, m_idx * block_M : (m_idx + 1) * block_M], dA_cs_m_shared) T.copy(dA_cs_m_shared, dA_cs_m_local) T.clear(acc_o) for i in T.Parallel(block_M): scale_m_local[i] = T.exp2(dA_cs_m_local[i] * p) T.copy( - C[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz // (nheads // ngroups), 0:block_Dstate], C_shared) - T.copy( - prev_states[batch_idx, chunk_idx, bz, n_idx * block_N:(n_idx + 1) * block_N, - 0:block_Dstate], prev_state_shared) + C[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz // (nheads // ngroups), + 0:block_Dstate, + ], + C_shared, + ) + T.copy(prev_states[batch_idx, chunk_idx, bz, n_idx * block_N : (n_idx + 1) * block_N, 0:block_Dstate], prev_state_shared) T.gemm(C_shared, prev_state_shared, acc_o, transpose_B=True) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] *= scale_m_local[i] @@ -165,34 +168,47 @@ def main( for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - cb[batch_idx, chunk_idx, bz // (nheads // ngroups), - m_idx * block_M:(m_idx + 1) * block_M, k * block_K:(k + 1) * block_K], - cb_shared) + cb[ + batch_idx, + chunk_idx, + bz // (nheads // ngroups), + m_idx * block_M : (m_idx + 1) * block_M, + k * block_K : (k + 1) * block_K, + ], + cb_shared, + ) T.copy(cb_shared, cb_local) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cs_k_shared) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cs_k_shared) T.copy(dA_cs_k_shared, dA_cs_k_local) for i, j in T.Parallel(block_M, block_K): - cb_local[i, - j] = cb_local[i, - j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + cb_local[i, j] = cb_local[i, j] * T.exp2(dA_cs_m_local[i] * p - dA_cs_k_local[j] * p) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dt_shared) T.copy(dt_shared, dt_local) for i, j in T.Parallel(block_M, block_K): cb_local[i, j] *= dt_local[j] for i, j in T.Parallel(block_M, block_K): - cb_local[i, 
j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, - cb_local[i, j], 0) + cb_local[i, j] = T.if_then_else(m_idx * block_M + i >= k * block_K + j, cb_local[i, j], 0) T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz, n_idx * block_N:(n_idx + 1) * block_N], x_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_shared, + ) T.gemm(cb_local, x_shared, acc_o) D_local[0] = D[bz] T.copy( - x[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N], - x_residual_shared) + x[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + x_residual_shared, + ) T.copy(x_residual_shared, x_residual_local) for i, j in T.Parallel(block_M, block_N): acc_o[i, j] += x_residual_local[i, j] * D_local[0] @@ -200,27 +216,40 @@ def main( T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - Output[batch_idx, chunk_idx * chunk_size + m_idx * block_M:chunk_idx * chunk_size + - (m_idx + 1) * block_M, bz, n_idx * block_N:(n_idx + 1) * block_N]) + Output[ + batch_idx, + chunk_idx * chunk_size + m_idx * block_M : chunk_idx * chunk_size + (m_idx + 1) * block_M, + bz, + n_idx * block_N : (n_idx + 1) * block_N, + ], + ) return main if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) total_flops = 2 * batch * seq_len * chunk_size * heads * dim * 0.5 + 2 * batch * seq_len * heads * dim * dstate - if (not args.tune): + if not args.tune: kernel = chunk_scan_fwd( batch, seq_len, @@ -234,7 +263,8 @@ def main( block_K=64, block_Dstate=128, num_stages=2, - threads=128) + threads=128, + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") diff --git 
a/examples/linear_attention/example_mamba_chunk_state.py b/examples/linear_attention/example_mamba_chunk_state.py index ad3df0df8..96126889b 100644 --- a/examples/linear_attention/example_mamba_chunk_state.py +++ b/examples/linear_attention/example_mamba_chunk_state.py @@ -10,6 +10,7 @@ def chunk_state_triton(B, x, dt, dA_cumsum): from mamba_ssm.ops.triton.ssd_chunk_state import _chunk_state_fwd + return _chunk_state_fwd(B, x, dt, dA_cumsum, states_in_fp32=False) @@ -41,46 +42,33 @@ def ref_program(B, x, dt, dA_cumsum): x = rearrange(x, "b (c l) h p -> b c l h p", l=chunk_size) B = rearrange(B, "b (c l) ... -> b c l ...", l=chunk_size) decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum)) - return torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B.to(x.dtype), decay_states.to(x.dtype), - dt.to(x.dtype), x) + return torch.einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B.to(x.dtype), decay_states.to(x.dtype), dt.to(x.dtype), x) def get_configs(): - iter_params = dict( - block_M=[64, 128], block_N=[32, 64, 128], block_K=[32, 64], num_stages=[1, 2, 3, 4, 5]) + iter_params = dict(block_M=[64, 128], block_N=[32, 64, 128], block_K=[32, 64], num_stages=[1, 2, 3, 4, 5]) return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] @autotune(configs=get_configs(), warmup=10, rep=10) @tilelang.jit(out_idx=[4]) -def chunk_state_fwd(batch, - seqlen, - chunk_size, - ngroups, - nheads, - headdim, - dstate, - block_M=64, - block_N=64, - block_K=64, - num_stages=2, - threads=128): - dtype = "float16" - accum_dtype = "float" +def chunk_state_fwd( + batch, seqlen, chunk_size, ngroups, nheads, headdim, dstate, block_M=64, block_N=64, block_K=64, num_stages=2, threads=128 +): + dtype = T.float16 + accum_dtype = T.float32 nchunks = T.ceildiv(seqlen, chunk_size) p = 1.44269504 @T.prim_func - def main(B: T.Tensor((batch, seqlen, ngroups, dstate), dtype), x: T.Tensor( - (batch, seqlen, nheads, headdim), dtype), dt: T.Tensor( - (batch, nheads, nchunks, chunk_size), dtype), dA_cumsum: T.Tensor( - (batch, nheads, nchunks, chunk_size), dtype), Output: T.Tensor( - (batch, nchunks, nheads, headdim, dstate), dtype)): - with T.Kernel( - nheads, - T.ceildiv(headdim, block_M) * T.ceildiv(dstate, block_N), - batch * nchunks, - threads=threads) as (bz, bx, by): + def main( + B: T.Tensor((batch, seqlen, ngroups, dstate), dtype), + x: T.Tensor((batch, seqlen, nheads, headdim), dtype), + dt: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), + dA_cumsum: T.Tensor((batch, nheads, nchunks, chunk_size), dtype), + Output: T.Tensor((batch, nchunks, nheads, headdim, dstate), dtype), + ): + with T.Kernel(nheads, T.ceildiv(headdim, block_M) * T.ceildiv(dstate, block_N), batch * nchunks, threads=threads) as (bz, bx, by): x_shared = T.alloc_shared((block_K, block_M), dtype) x_local = T.alloc_fragment((block_K, block_M), dtype) xt_local = T.alloc_fragment((block_M, block_K), dtype) @@ -101,20 +89,22 @@ def main(B: T.Tensor((batch, seqlen, ngroups, dstate), dtype), x: T.Tensor( m_idx = bx // T.ceildiv(dstate, block_N) n_idx = bx % T.ceildiv(dstate, block_N) - T.annotate_layout({ - x_shared: tilelang.layout.make_swizzled_layout(x_shared), - acc_o_shared: tilelang.layout.make_swizzled_layout(acc_o_shared) - }) + T.annotate_layout({x_shared: tilelang.layout.make_swizzled_layout(x_shared)}) dA_cs_last[0] = dA_cumsum[batch_idx, bz, chunk_idx, chunk_size - 1] T.clear(acc_o) for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy( - x[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size 
+ - (k + 1) * block_K, bz, m_idx * block_M:(m_idx + 1) * block_M], x_shared) - T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], - dA_cumsum_shared) - T.copy(dt[batch_idx, bz, chunk_idx, k * block_K:(k + 1) * block_K], dt_shared) + x[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz, + m_idx * block_M : (m_idx + 1) * block_M, + ], + x_shared, + ) + T.copy(dA_cumsum[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dA_cumsum_shared) + T.copy(dt[batch_idx, bz, chunk_idx, k * block_K : (k + 1) * block_K], dt_shared) T.copy(dA_cumsum_shared, dA_cumsum_local) T.copy(dt_shared, dt_local) for i in T.Parallel(block_K): @@ -123,47 +113,50 @@ def main(B: T.Tensor((batch, seqlen, ngroups, dstate), dtype), x: T.Tensor( for i, j in T.Parallel(block_M, block_K): xt_local[i, j] = x_local[j, i] * scale[j] T.copy( - B[batch_idx, chunk_idx * chunk_size + k * block_K:chunk_idx * chunk_size + - (k + 1) * block_K, bz // (nheads // ngroups), - n_idx * block_N:(n_idx + 1) * block_N], B_shared) + B[ + batch_idx, + chunk_idx * chunk_size + k * block_K : chunk_idx * chunk_size + (k + 1) * block_K, + bz // (nheads // ngroups), + n_idx * block_N : (n_idx + 1) * block_N, + ], + B_shared, + ) T.gemm(xt_local, B_shared, acc_o) T.copy(acc_o, acc_o_shared) T.copy( acc_o_shared, - Output[batch_idx, chunk_idx, bz, m_idx * block_M:(m_idx + 1) * block_M, - n_idx * block_N:(n_idx + 1) * block_N]) + Output[batch_idx, chunk_idx, bz, m_idx * block_M : (m_idx + 1) * block_M, n_idx * block_N : (n_idx + 1) * block_N], + ) return main if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=8, help='batch size') - parser.add_argument('--heads', type=int, default=80, help='heads') - parser.add_argument('--groups', type=int, default=1, help='groups') - parser.add_argument('--seq_len', type=int, default=4096, help='sequence length') - parser.add_argument('--chunk_size', type=int, default=256, help='chunk size') - parser.add_argument('--dim', type=int, default=64, help='dim') - parser.add_argument('--dstate', type=int, default=128, help='dstate') - parser.add_argument('--tune', action='store_true', help='tune configs') + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=80, help="heads") + parser.add_argument("--groups", type=int, default=1, help="groups") + parser.add_argument("--seq_len", type=int, default=4096, help="sequence length") + parser.add_argument("--chunk_size", type=int, default=256, help="chunk size") + parser.add_argument("--dim", type=int, default=64, help="dim") + parser.add_argument("--dstate", type=int, default=128, help="dstate") + parser.add_argument("--tune", action="store_true", help="tune configs") args = parser.parse_args() - batch, heads, groups, seq_len, chunk_size, dim, dstate = args.batch, args.heads, args.groups, args.seq_len, args.chunk_size, args.dim, args.dstate + batch, heads, groups, seq_len, chunk_size, dim, dstate = ( + args.batch, + args.heads, + args.groups, + args.seq_len, + args.chunk_size, + args.dim, + args.dstate, + ) total_flops = 2 * batch * seq_len * heads * dim * dstate - if (not args.tune): + if not args.tune: kernel = chunk_state_fwd( - batch, - seq_len, - chunk_size, - groups, - heads, - dim, - dstate, - block_M=64, - block_N=128, - block_K=64, - num_stages=4, - threads=128) + batch, seq_len, chunk_size, groups, heads, dim, dstate, block_M=64, block_N=128, block_K=64, 
num_stages=4, threads=128 + ) profiler = kernel.get_profiler(tilelang.TensorSupplyType.Normal) profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01) print("All checks pass.") diff --git a/examples/linear_attention/example_retention_fwd.py b/examples/linear_attention/example_retention_fwd.py index 66012e0c1..f45e38388 100644 --- a/examples/linear_attention/example_retention_fwd.py +++ b/examples/linear_attention/example_retention_fwd.py @@ -13,13 +13,12 @@ def chunk_retention_fwd_kernel( H, DK, DV, - dtype: str = 'float16', + dtype: T.dtype = T.float16, scale: float = None, ) -> torch.Tensor: - if scale is None: scale = DK**-0.5 - accum_dtype = 'float' + accum_dtype = T.float32 chunk_size = 64 BK = BV = 64 # Set to 128 can be faster, but has some numerical differences with FLA @@ -30,16 +29,16 @@ def chunk_retention_fwd_kernel( @T.prim_func def chunk_retention_fwd( - Q: T.Tensor([B, S, H, DK], dtype), # type: ignore - K: T.Tensor([B, S, H, DK], dtype), # type: ignore - V: T.Tensor([B, S, H, DV], dtype), # type: ignore - O: T.Tensor([NK, B, S, H, DV], dtype), # type: ignore + Q: T.Tensor([B, S, H, DK], dtype), # type: ignore + K: T.Tensor([B, S, H, DK], dtype), # type: ignore + V: T.Tensor([B, S, H, DV], dtype), # type: ignore + O: T.Tensor([NK, B, S, H, DV], dtype), # type: ignore ): with T.Kernel(NV, NK, B * H) as (i_v, i_k, i_bh): i_b = i_bh // H i_h = i_bh % H - log_decay = T.alloc_var('float32') - log_decay = T.log2(1 - T.exp2(-5. - 1. * i_h)) # Head-specific log decay + log_decay = T.alloc_var(T.float32) + log_decay = T.log2(1 - T.exp2(-5.0 - 1.0 * i_h)) # Head-specific log decay q = T.alloc_shared([chunk_size, BK], dtype) k = T.alloc_shared([chunk_size, BK], dtype) @@ -51,26 +50,17 @@ def chunk_retention_fwd( o = T.alloc_fragment([chunk_size, BV], accum_dtype) T.clear(h) - T.annotate_layout({ - q: tl.layout.make_swizzled_layout(q), - k: tl.layout.make_swizzled_layout(k), - v: tl.layout.make_swizzled_layout(v), - h_shared: tl.layout.make_swizzled_layout(h_shared), - s_shared: tl.layout.make_swizzled_layout(s_shared), - }) T.use_swizzle(10) for i in T.Pipelined(0, NT): for row, col in T.Parallel(chunk_size, BK): q[row, col] = Q[i_b, i * chunk_size + row, i_h, i_k * BK + col] * scale - T.copy(K[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_k * BK:(i_k + 1) * BK], k) - T.copy(V[i_b, i * chunk_size:(i + 1) * chunk_size, i_h, i_v * BV:(i_v + 1) * BV], v) + T.copy(K[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_k * BK : (i_k + 1) * BK], k) + T.copy(V[i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV], v) T.gemm(q, k, s, clear_accum=True, transpose_B=True) for row, col in T.Parallel(chunk_size, chunk_size): - s_shared[row, - col] = T.if_then_else(row >= col, s[row, col] * T.exp2( - (row - col) * log_decay), 0) + s_shared[row, col] = T.if_then_else(row >= col, s[row, col] * T.exp2((row - col) * log_decay), 0) T.copy(h, h_shared) T.gemm(q, h_shared, o, clear_accum=True) @@ -82,9 +72,7 @@ def chunk_retention_fwd( v[row, col] = v[row, col] * T.exp2((chunk_size - row - 1) * log_decay) for row, col in T.Parallel(BK, BV): h[row, col] = T.exp2(chunk_size * log_decay) * h[row, col] - T.copy( - o, O[i_k, i_b, i * chunk_size:(i + 1) * chunk_size, i_h, - i_v * BV:(i_v + 1) * BV]) + T.copy(o, O[i_k, i_b, i * chunk_size : (i + 1) * chunk_size, i_h, i_v * BV : (i_v + 1) * BV]) T.gemm(k, v, h, transpose_A=True) return chunk_retention_fwd @@ -96,24 +84,24 @@ def postprocess(o): def main(): parser = argparse.ArgumentParser() - parser.add_argument('--B', type=int, 
default=8, help='Batch size') - parser.add_argument('--S', type=int, default=4096, help='Seq len') - parser.add_argument('--H', type=int, default=32, help='Num heads') - parser.add_argument('--D', type=int, default=128, help='Head dim') + parser.add_argument("--B", type=int, default=8, help="Batch size") + parser.add_argument("--S", type=int, default=4096, help="Seq len") + parser.add_argument("--H", type=int, default=32, help="Num heads") + parser.add_argument("--D", type=int, default=128, help="Head dim") args = parser.parse_args() B, S, H, D = args.B, args.S, args.H, args.D total_flops = 2.0 * B * S * S * H * D # causal - q = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - k = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) - v = torch.randn((B, S, H, D), device='cuda', dtype=torch.float16) + q = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + k = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) + v = torch.randn((B, S, H, D), device="cuda", dtype=torch.float16) kernel = chunk_retention_fwd_kernel(B, S, H, D, D) t = do_bench(lambda: postprocess(kernel(q, k, v)), warmup=25, rep=100) - print(f'Tilelang latency: {t:.3f} ms') - print(f'Tilelang TFLOPs: {total_flops/t * 1e-9}') + print(f"Tilelang latency: {t:.3f} ms") + print(f"Tilelang TFLOPs: {total_flops / t * 1e-9}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/linear_attention/regression_linear_attn.py b/examples/linear_attention/regression_linear_attn.py new file mode 100644 index 000000000..ced854087 --- /dev/null +++ b/examples/linear_attention/regression_linear_attn.py @@ -0,0 +1,15 @@ +import tilelang.testing +import example_linear_attn_bwd +import example_linear_attn_fwd + + +def regression_example_linear_attn_fwd(): + tilelang.testing.process_func(example_linear_attn_fwd.run_regression_perf) + + +def regression_example_linear_attn_bwd(): + tilelang.testing.process_func(example_linear_attn_bwd.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/minference/example_vertical_slash_sparse_attn.py b/examples/minference/example_vertical_slash_sparse_attn.py index ebf8513a1..91af8b454 100644 --- a/examples/minference/example_vertical_slash_sparse_attn.py +++ b/examples/minference/example_vertical_slash_sparse_attn.py @@ -15,12 +15,11 @@ @tilelang.jit(out_idx=[3]) def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_size): - block_M = 64 block_N = 64 num_stages = 2 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 + scale = (1.0 / dim) ** 0.5 * 1.44269504 shape = [batch, heads, seq_len, dim] seq_blocks = (seq_len + block_M - 1) // block_M @@ -30,15 +29,13 @@ def _tl_vs_sparse_flashattn(batch, heads, seq_len, dim, vertical_size, slash_siz offset_shape = count_shape + [slash_size] index_shape = count_shape + [vertical_size] - vertical_size_round, slash_size_round = tilelang.next_power_of_2( - vertical_size), tilelang.next_power_of_2(slash_size) + vertical_size_round, slash_size_round = tilelang.next_power_of_2(vertical_size), tilelang.next_power_of_2(slash_size) - dtype = "float16" - accum_dtype = "float" - int_dtype = "int32" + dtype = T.float16 + accum_dtype = T.float32 + int_dtype = T.int32 def kernel_func(block_M, block_N, num_stages, threads): - @T.macro def Prefetch( K: T.Tensor(shape, dtype), @@ -53,32 +50,30 @@ def Prefetch( ): with T.attr("default", "async_scope", 1): for i, j in T.Parallel(block_N, dim): - K_shared[i, j] = T.if_then_else(k + i < 
column_count, - K[bz, by, column_index[k + i], j], 0) + K_shared[i, j] = T.if_then_else(k + i < column_count, K[bz, by, column_index[k + i], j], 0) with T.attr("default", "async_scope", 1): for i, j in T.Parallel(block_N, dim): - V_shared[i, j] = T.if_then_else(k + i < column_count, - V[bz, by, column_index[k + i], j], 0) + V_shared[i, j] = T.if_then_else(k + i < column_count, V[bz, by, column_index[k + i], j], 0) T.ptx_commit_group() @T.macro def Compute( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - k: T.int32, - column_count: T.int32, - Q_shared: T.SharedBuffer([block_M, dim], dtype), - K_shared: T.SharedBuffer([block_N, dim], dtype), - V_shared: T.SharedBuffer([block_N, dim], dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - count: T.int32, + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + k: T.int32, + column_count: T.int32, + Q_shared: T.SharedBuffer([block_M, dim], dtype), + K_shared: T.SharedBuffer([block_N, dim], dtype), + V_shared: T.SharedBuffer([block_N, dim], dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), + count: T.int32, ): T.ptx_wait_group(count) for i, j in T.Parallel(block_M, block_N): @@ -87,6 +82,8 @@ def Compute( T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) @@ -106,17 +103,16 @@ def Compute( @T.prim_func def vs_sparse_flashattn_ws( - Q: T.Tensor(shape, dtype), - K: T.Tensor(shape, dtype), - V: T.Tensor(shape, dtype), - Output: T.Tensor(shape, dtype), - BlockCount: T.Tensor(count_shape, int_dtype), - BlockOffset: T.Tensor(offset_shape, int_dtype), - ColumnCount: T.Tensor(count_shape, int_dtype), - ColumnIndex: T.Tensor(index_shape, int_dtype), + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), + BlockCount: T.Tensor(count_shape, int_dtype), + BlockOffset: T.Tensor(offset_shape, int_dtype), + ColumnCount: T.Tensor(count_shape, int_dtype), + ColumnIndex: T.Tensor(index_shape, int_dtype), ): with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=256) as (bc, by, bz): - bx = T.ceildiv(seq_len, block_M) - 1 - bc Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([2, block_N, dim], dtype) @@ -134,19 +130,15 @@ def vs_sparse_flashattn_ws( scores_scale = T.alloc_fragment([block_M], accum_dtype) scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - block_count = T.alloc_local([1], int_dtype) + block_count = T.alloc_var(dtype=int_dtype) block_offset = T.alloc_shared([slash_size_round], int_dtype, scope="shared") - column_count = T.alloc_local([1], int_dtype) + 
column_count = T.alloc_var(dtype=int_dtype) column_index = T.alloc_shared([vertical_size_round], int_dtype, scope="shared") T.create_list_of_mbarrier([128] * 9) - T.annotate_layout({ - O_shared: tilelang.layout.make_swizzled_layout(O_shared), - }) - - block_count[0] = BlockCount[bz, by, bx] - column_count[0] = ColumnCount[bz, by, bx] + block_count = BlockCount[bz, by, bx] + column_count = ColumnCount[bz, by, bx] for vi in T.Parallel(slash_size_round): if vi < slash_size: @@ -160,15 +152,15 @@ def vs_sparse_flashattn_ws( if tid >= 128: T.annotate_producer_reg_dealloc() - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.mbarrier_arrive(mbarrier=8) - for bi in T.serial(block_count[0]): + for bi in T.serial(block_count): k = block_offset[bi] T.mbarrier_wait_parity(mbarrier=bi % 2 + 4, parity=(((bi & 3) >> 1) ^ 1)) - T.copy(K[bz, by, k:k + block_N, :], K_shared[bi % 2, :, :]) + T.copy(K[bz, by, k : k + block_N, :], K_shared[bi % 2, :, :]) T.mbarrier_arrive(mbarrier=bi % 2) T.mbarrier_wait_parity(mbarrier=bi % 2 + 6, parity=(((bi & 3) >> 1) ^ 1)) - T.copy(V[bz, by, k:k + block_N, :], V_shared[bi % 2, :, :]) + T.copy(V[bz, by, k : k + block_N, :], V_shared[bi % 2, :, :]) T.mbarrier_arrive(mbarrier=bi % 2 + 2) else: T.annotate_consumer_reg_alloc() @@ -176,40 +168,31 @@ def vs_sparse_flashattn_ws( T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) T.mbarrier_wait_parity(mbarrier=8, parity=0) - for bi in T.serial(block_count[0]): + for bi in T.serial(block_count): k = block_offset[bi] for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k + j, 0, - -T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k + j, 0, -T.infinity(acc_s.dtype)) T.mbarrier_wait_parity(mbarrier=bi % 2, parity=((bi & 3) >> 1)) - T.gemm( - Q_shared, - K_shared[bi % 2, :, :], - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) + T.gemm(Q_shared, K_shared[bi % 2, :, :], acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) T.mbarrier_arrive(mbarrier=bi % 2 + 4) T.copy(scores_max, scores_max_prev) T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - - scores_max[i] * scale) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_M, dim): acc_o[i, j] = acc_o[i, j] * scores_scale[i] T.copy(acc_s, acc_s_cast) - T.mbarrier_wait_parity(mbarrier=bi % 2 + 2, parity=(((bi & 3) >> 1))) - T.gemm( - acc_s_cast, - V_shared[bi % 2, :, :], - acc_o, - policy=T.GemmWarpPolicy.FullRow) + T.mbarrier_wait_parity(mbarrier=bi % 2 + 2, parity=((bi & 3) >> 1)) + T.gemm(acc_s_cast, V_shared[bi % 2, :, :], acc_o, policy=T.GemmWarpPolicy.FullRow) T.mbarrier_arrive(mbarrier=bi % 2 + 6) @@ -218,39 +201,86 @@ def vs_sparse_flashattn_ws( for i in T.Parallel(block_M): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - if column_count[0] != 0: - Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count[0], 0, bz, - by) - for bi in T.serial(T.ceildiv(column_count[0], block_N) - 1): + if column_count != 0: + Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count, 0, bz, by) + for bi in T.serial(T.ceildiv(column_count, block_N) - 1): k = 
bi * block_N if bi % 2 == 0: - Prefetch(K, V, K_shared_2, V_shared_2, column_index, - column_count[0], k + block_N, bz, by) - - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, k, - column_count[0], Q_shared, K_shared_1, V_shared_1, - scores_scale, scores_sum, logsum, 1) + Prefetch(K, V, K_shared_2, V_shared_2, column_index, column_count, k + block_N, bz, by) + + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + k, + column_count, + Q_shared, + K_shared_1, + V_shared_1, + scores_scale, + scores_sum, + logsum, + 1, + ) else: - Prefetch(K, V, K_shared_1, V_shared_1, column_index, - column_count[0], k + block_N, bz, by) - - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, k, - column_count[0], Q_shared, K_shared_2, V_shared_2, - scores_scale, scores_sum, logsum, 1) - if T.ceildiv(column_count[0], block_N) % 2 == 0: - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, - T.ceildiv(column_count[0], block_N) * block_N - block_N, - column_count[0], Q_shared, K_shared_2, V_shared_2, scores_scale, - scores_sum, logsum, 0) + Prefetch(K, V, K_shared_1, V_shared_1, column_index, column_count, k + block_N, bz, by) + + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + k, + column_count, + Q_shared, + K_shared_2, + V_shared_2, + scores_scale, + scores_sum, + logsum, + 1, + ) + if T.ceildiv(column_count, block_N) % 2 == 0: + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + T.ceildiv(column_count, block_N) * block_N - block_N, + column_count, + Q_shared, + K_shared_2, + V_shared_2, + scores_scale, + scores_sum, + logsum, + 0, + ) else: - Compute(acc_s, acc_s_cast, acc_o, scores_max, scores_max_prev, - T.ceildiv(column_count[0], block_N) * block_N - block_N, - column_count[0], Q_shared, K_shared_1, V_shared_1, scores_scale, - scores_sum, logsum, 0) + Compute( + acc_s, + acc_s_cast, + acc_o, + scores_max, + scores_max_prev, + T.ceildiv(column_count, block_N) * block_N - block_N, + column_count, + Q_shared, + K_shared_1, + V_shared_1, + scores_scale, + scores_sum, + logsum, + 0, + ) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return vs_sparse_flashattn_ws @@ -466,11 +496,8 @@ def vertical_slash_sparse_attention( import os current_dir = os.path.dirname(os.path.abspath(__file__)) - sources = [ - os.path.join(current_dir, 'ops', 'kernels.cpp'), - os.path.join(current_dir, 'ops', 'vertical_slash_index.cu') - ] - ops = load(name='convert', sources=sources, verbose=False) + sources = [os.path.join(current_dir, "ops", "kernels.cpp"), os.path.join(current_dir, "ops", "vertical_slash_index.cu")] + ops = load(name="convert", sources=sources, verbose=False) convert_vertical_slash_indexes = ops.convert_vertical_slash_indexes batch_size, num_heads, context_size, head_dim = query.shape pad = (block_size_M - context_size) & (block_size_M - 1) @@ -481,15 +508,13 @@ def vertical_slash_sparse_attention( value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0]) if head_dim not in [16, 32, 64, 128, 256, 512]: - target_dim = 2**math.ceil(math.log2(head_dim)) - head_dim + target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim query = torch.nn.functional.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0]) key = torch.nn.functional.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0]) value = torch.nn.functional.pad(value, [0, 
target_dim, 0, 0, 0, 0, 0, 0]) - v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort( - dim=-1, descending=False)[0] - s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort( - dim=-1, descending=True)[0] + v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0] + s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0] seqlens = torch.tensor([context_size] * query.shape[0], dtype=torch.int32, device=query.device) sm_scale = head_dim**-0.5 @@ -502,8 +527,7 @@ def vertical_slash_sparse_attention( block_size_N, ) - tl_kernel = _tl_vs_sparse_flashattn(batch_size, num_heads, context_size, head_dim, - v_idx.shape[2], s_idx.shape[2]) + tl_kernel = _tl_vs_sparse_flashattn(batch_size, num_heads, context_size, head_dim, v_idx.shape[2], s_idx.shape[2]) def run(is_triton: bool = True): if is_triton: @@ -521,8 +545,7 @@ def run(is_triton: bool = True): block_size_N, ) else: - out = tl_kernel(query, key, value, block_count, block_offset, column_count, - column_index) + out = tl_kernel(query, key, value, block_count, block_offset, column_count, column_index) return out[..., :context_size, :head_dim] return run @@ -532,8 +555,7 @@ def sum_all_diagonal_matrix(mat: torch.tensor): b, h, n, m = mat.shape zero_mat = torch.zeros((b, h, n, n)).to(mat.device) # Zero matrix used for padding mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) # pads the matrix on left and right - mat_strided = mat_padded.as_strided( - (1, 1, n, n + m), (1, n * (2 * n + m), 2 * n + m + 1, 1)) # Change the strides + mat_strided = mat_padded.as_strided((1, 1, n, n + m), (1, n * (2 * n + m), 2 * n + m + 1, 1)) # Change the strides sum_diags = torch.sum(mat_strided, 2) # Sums the resulting matrix's columns return sum_diags[:, :, 1:] @@ -555,24 +577,23 @@ def main(argv=None): vertical_size, slash_size = args.vertical_size, args.slash_size torch.manual_seed(0) - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) q_len = SEQ_LEN vertical_size, slash_size = min(q_len, vertical_size), min(q_len, slash_size) last_q = 64 - qk = torch.einsum('bhmk, bhnk -> bhmn', q[:, :, -last_q:, :], k) + qk = torch.einsum("bhmk, bhnk -> bhmn", q[:, :, -last_q:, :], k) arange = torch.arange(last_q, device="cuda") - qk[:, :, :, -last_q:] = torch.where(arange[None, None, :, None] >= arange[None, None, None, :], - qk[:, :, :, -last_q:], -torch.inf) + qk[:, :, :, -last_q:] = torch.where(arange[None, None, :, None] >= arange[None, None, None, :], qk[:, :, :, -last_q:], -torch.inf) qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32) vertical = qk.sum(-2, keepdim=True) vertical[..., :30] = torch.inf vertical_topk = torch.topk(vertical, vertical_size, -1).indices - slash = sum_all_diagonal_matrix(qk)[..., :-last_q + 1] + slash = sum_all_diagonal_matrix(qk)[..., : -last_q + 1] slash[..., -30:] = torch.inf slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices @@ -592,5 +613,78 @@ def main(argv=None): print(f"speedup: {triton_time / tilelang_time:.2f}x") +def 
run_regression_perf(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=1) + parser.add_argument("--heads", type=int, default=1) + parser.add_argument("--seq_len", type=int, default=16384) + parser.add_argument("--head_dim", type=int, default=64) + parser.add_argument("--vertical_size", type=int, default=1000) + parser.add_argument("--slash_size", type=int, default=200) + args = parser.parse_args(argv) + BATCH, N_HEADS, SEQ_LEN, D_HEAD = args.batch, args.heads, args.seq_len, args.head_dim + vertical_size, slash_size = args.vertical_size, args.slash_size + torch.manual_seed(0) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + q_len = SEQ_LEN + vertical_size, slash_size = min(q_len, vertical_size), min(q_len, slash_size) + last_q = 64 + qk = torch.einsum("bhmk, bhnk -> bhmn", q[:, :, -last_q:, :], k) + arange = torch.arange(last_q, device="cuda") + qk[:, :, :, -last_q:] = torch.where(arange[None, None, :, None] >= arange[None, None, None, :], qk[:, :, :, -last_q:], -torch.inf) + qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32) + vertical = qk.sum(-2, keepdim=True) + vertical[..., :30] = torch.inf + vertical_topk = torch.topk(vertical, vertical_size, -1).indices + slash = sum_all_diagonal_matrix(qk)[..., : -last_q + 1] + slash[..., -30:] = torch.inf + slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices + block_size_M = 64 + block_size_N = 64 + query, key, value = q, k, v + v_idx, s_idx = vertical_topk, slash + batch_size, num_heads, context_size, head_dim = query.shape + v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0] + s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0] + from torch.utils.cpp_extension import load + import os + + current_dir = os.path.dirname(os.path.abspath(__file__)) + sources = [os.path.join(current_dir, "ops", "kernels.cpp"), os.path.join(current_dir, "ops", "vertical_slash_index.cu")] + ops = load(name="convert", sources=sources, verbose=False) + convert_vertical_slash_indexes = ops.convert_vertical_slash_indexes + batch_size, num_heads, context_size, head_dim = query.shape + pad = (block_size_M - context_size) & (block_size_M - 1) + if pad == block_size_M: + pad = 0 + query = torch.nn.functional.pad(query, [0, 0, 0, pad, 0, 0, 0, 0]) + key = torch.nn.functional.pad(key, [0, 0, 0, pad, 0, 0, 0, 0]) + value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0]) + if head_dim not in [16, 32, 64, 128, 256, 512]: + target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim + query = torch.nn.functional.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0]) + key = torch.nn.functional.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0]) + value = torch.nn.functional.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0]) + v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0] + s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0] + seqlens = torch.tensor([context_size] * query.shape[0], dtype=torch.int32, device=query.device) + block_count, block_offset, column_count, column_index = convert_vertical_slash_indexes( + seqlens, + v_idx, + s_idx, + context_size, + block_size_M, + block_size_N, + ) + tl_kernel = 
_tl_vs_sparse_flashattn(batch_size, num_heads, context_size, head_dim, vertical_topk.shape[-1], slash.shape[-1]) + + def run_kernel_only(): + tl_kernel(query, key, value, block_count, block_offset, column_count, column_index) + + return do_bench(run_kernel_only, backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/minference/regression_vs_sparse_attn.py b/examples/minference/regression_vs_sparse_attn.py new file mode 100644 index 000000000..32fdfa9e8 --- /dev/null +++ b/examples/minference/regression_vs_sparse_attn.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_vertical_slash_sparse_attn + + +def regression_example_vertical_slash_sparse_attn(): + tilelang.testing.process_func(example_vertical_slash_sparse_attn.run_regression_perf, argv=[]) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/norm/rms_norm.py b/examples/norm/rms_norm.py index 25bac50fc..57bccc1a0 100644 --- a/examples/norm/rms_norm.py +++ b/examples/norm/rms_norm.py @@ -4,7 +4,7 @@ def rms_norm_splitk(M, N, blk_m, blk_k): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -21,7 +21,7 @@ def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): A_local[i, j] += A_shared[i, j] * A_shared[i, j] T.reduce_sum(A_local, A_powsum, dim=1) for i in T.Parallel(blk_m): - A_powsum[i] = T.rsqrt(A_powsum[i] / N) + 1e-12 + A_powsum[i] = T.rsqrt(A_powsum[i] / N + 1e-12) for k in range(num_k_step): # reverse, better cache hit rate @@ -35,7 +35,7 @@ def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @tilelang.jit(out_idx=[-1], pass_configs={"tl.disable_tma_lower": True}) def rms_norm(M, N, blk_m): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -45,16 +45,16 @@ def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): A_local = T.alloc_fragment((blk_m, N), dtype) A_powsum = T.alloc_fragment((blk_m,), dtype) - T.copy(A[bx * blk_m:(bx + 1) * blk_m, :], A_shared) + T.copy(A[bx * blk_m : (bx + 1) * blk_m, :], A_shared) T.copy(A_shared, A_local) for i, j in T.Parallel(blk_m, N): A_pow_local[i, j] = A_local[i, j] * A_local[i, j] T.reduce_sum(A_pow_local, A_powsum, dim=1) for i in T.Parallel(blk_m): - A_powsum[i] = T.rsqrt(A_powsum[i] / N) + 1e-12 + A_powsum[i] = T.rsqrt(A_powsum[i] / N + 1e-12) for i, j in T.Parallel(blk_m, N): A_local[i, j] *= A_powsum[i] - T.copy(A_local, B[bx * blk_m:(bx + 1) * blk_m, :]) + T.copy(A_local, B[bx * blk_m : (bx + 1) * blk_m, :]) return main diff --git a/examples/norm/test_rms_norm.py b/examples/norm/test_rms_norm.py index 8cc413531..53db03d98 100644 --- a/examples/norm/test_rms_norm.py +++ b/examples/norm/test_rms_norm.py @@ -5,7 +5,7 @@ def rms_norm_splitk(M, N, blk_m, blk_k): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -22,7 +22,7 @@ def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): A_local[i, j] += A_shared[i, j] * A_shared[i, j] T.reduce_sum(A_local, A_powsum, dim=1) for i in T.Parallel(blk_m): - A_powsum[i] = T.rsqrt(A_powsum[i] / N) + 1e-12 + A_powsum[i] = T.rsqrt(A_powsum[i] / N + 1e-12) for k in range(num_k_step): # reverse, better cache hit rate @@ -35,7 +35,7 @@ def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): def rms_norm(M, N, blk_m): - dtype = "float" + dtype = T.float @T.prim_func def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): @@ -45,16 +45,16 
@@ def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)): A_local = T.alloc_fragment((blk_m, N), dtype) A_powsum = T.alloc_fragment((blk_m,), dtype) - T.copy(A[bx * blk_m:(bx + 1) * blk_m, :], A_shared) + T.copy(A[bx * blk_m : (bx + 1) * blk_m, :], A_shared) T.copy(A_shared, A_local) for i, j in T.Parallel(blk_m, N): A_pow_local[i, j] = A_local[i, j] * A_local[i, j] T.reduce_sum(A_pow_local, A_powsum, dim=1) for i in T.Parallel(blk_m): - A_powsum[i] = T.rsqrt(A_powsum[i] / N) + 1e-12 + A_powsum[i] = T.rsqrt(A_powsum[i] / N + 1e-12) for i, j in T.Parallel(blk_m, N): A_local[i, j] *= A_powsum[i] - T.copy(A_local, B[bx * blk_m:(bx + 1) * blk_m, :]) + T.copy(A_local, B[bx * blk_m : (bx + 1) * blk_m, :]) return main diff --git a/examples/online_softmax/online_softmax.py b/examples/online_softmax/online_softmax.py index 432482d06..811870e44 100644 --- a/examples/online_softmax/online_softmax.py +++ b/examples/online_softmax/online_softmax.py @@ -9,19 +9,19 @@ def softmax_kernel( M, N, - dtype: str = "float16", + dtype: T.dtype = T.float16, ) -> "Callable": BN = min(tl.next_power_of_2(N), 8192) NN = tl.cdiv(N, BN) - accum_dtype = "float" + accum_dtype = T.float32 scale = 1.44269504 # log2(e) @T.prim_func def main( - X: T.Tensor([M, N], dtype), - Y: T.Tensor([M, N], dtype), + X: T.Tensor([M, N], dtype), + Y: T.Tensor([M, N], dtype), ): with T.Kernel(M, threads=128) as (i_m): x = T.alloc_fragment([BN], dtype) @@ -33,7 +33,7 @@ def main( T.fill(lse, -T.infinity(accum_dtype)) for i_n in T.Pipelined(0, NN): - T.copy(X[i_m, i_n * BN:(i_n + 1) * BN], x) + T.copy(X[i_m, i_n * BN : (i_n + 1) * BN], x) T.reduce_max(x, max_x, dim=0, clear=True) @@ -45,12 +45,12 @@ def main( lse[0] = max_x[0] * scale + T.log2(T.exp2(lse[0] - max_x[0] * scale) + sum_exp_x[0]) for i_n in T.Pipelined(0, NN): - T.copy(X[i_m, i_n * BN:(i_n + 1) * BN], x) + T.copy(X[i_m, i_n * BN : (i_n + 1) * BN], x) for j in T.Parallel(BN): y[j] = T.exp2(x[j] * scale - lse[0]) - T.copy(y, Y[i_m, i_n * BN:(i_n + 1) * BN]) + T.copy(y, Y[i_m, i_n * BN : (i_n + 1) * BN]) return main @@ -69,4 +69,4 @@ def main( t2 = do_bench(lambda: kernel(X), warmup=25, rep=100) print(f"torch latency: {t1:.3f} ms") print(f"TileLang latency: {t2:.3f} ms") -print(f"Speedup: {t1/t2:.3f}x") +print(f"Speedup: {t1 / t2:.3f}x") diff --git a/examples/plot_layout/README.md b/examples/plot_layout/README.md index a65d771c2..8204e93d8 100644 --- a/examples/plot_layout/README.md +++ b/examples/plot_layout/README.md @@ -10,7 +10,7 @@ from typing import Literal, Callable from tilelang.intrinsics.utils import get_mma_micro_size from tilelang.tools import plot_layout -def make_mma_load_base_layout(dtype: str = "float16", +def make_mma_load_base_layout(dtype: str = T.float16, matrix: Literal["A", "B"] = "A", transposed: bool = False) -> T.Fragment: """ @@ -69,7 +69,7 @@ def make_mma_load_base_layout(dtype: str = "float16", micro_size_s, _, micro_size_r = get_mma_micro_size(dtype) transform_func = transform_func - inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype="int32") + inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype=T.int32) def forward_thread(i: int, j: int) -> int: """ @@ -94,7 +94,7 @@ def make_mma_load_base_layout(dtype: str = "float16", # Create a 16×16 matrix layout for ldmatrix operations -base_layout = make_mma_load_base_layout(dtype="float16", matrix="A", transposed=False) +base_layout = make_mma_load_base_layout(dtype=T.float16, matrix="A", transposed=False) # Print the layout structure (optional for debugging) 
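Editor's note on the `rms_norm.py` and `test_rms_norm.py` hunks above: moving the `1e-12` epsilon inside `T.rsqrt` matches the usual RMSNorm formulation, where the epsilon guards the division itself rather than being added to the reciprocal afterwards. A minimal PyTorch reference for comparison (the shape is illustrative and not taken from the example's main block):

import torch

def rms_norm_ref(A: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    # Normalize each row by sqrt(mean(x^2) + eps); with eps added outside the rsqrt,
    # as the old code did, an all-zero row would produce inf * 0 = NaN.
    return A * torch.rsqrt(A.pow(2).mean(dim=-1, keepdim=True) + eps)

A = torch.randn(64, 512)
A[0].zero_()                 # all-zero row: finite with eps inside the sqrt, NaN with it outside
B = rms_norm_ref(A)
assert torch.isfinite(B).all()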
print(base_layout) diff --git a/examples/plot_layout/fragment_mfma_load_a.py b/examples/plot_layout/fragment_mfma_load_a.py new file mode 100644 index 000000000..d45cc227b --- /dev/null +++ b/examples/plot_layout/fragment_mfma_load_a.py @@ -0,0 +1,127 @@ +import tilelang.language as T +from typing import Literal, Callable +from tvm.tir import IndexMap +from tilelang.intrinsics.utils import get_mma_micro_size + +from tilelang.intrinsics.mfma_layout import ( + shared_16x4_to_local_64x1_layout_A, + shared_16x16_to_local_64x4_layout_A, + shared_16x32_to_local_64x8_layout_A, + shared_16x64_to_local_64x16_layout_A, +) + + +def make_mfma_load_base_layout( + dtype: T.dtype = T.float16, matrix: Literal["A", "B"] = "A", k_dim: int = 16, transposed: bool = False +) -> T.Fragment: + """ + Create a layout function for storing MFMA results into a fragment buffer. + This layout is used in conjunction with `inverse_mfma_store_layout` to + map fragment indices to threads and local indices. + + Parameters + ---------- + dtype : str + The data type of the matrix. + matrix : Literal["A", "B"] + The mfma operand to be loaded. + k_dim : int + The k dimension of the mfma. + transposed : bool + Whether the matrix is transposed, by default False. + + Returns + ------- + T.Fragment + Describes how threads and indices in fragment are laid out. + + """ + + assert matrix in ["A", "B"], "matrix should be either A or B" + # s represents spatial axis + # r represents reduction axis + # sr represents the two dims are spatial + reduction + # rs represents the two dims are reduction + spatial + transform_func_sr_a: Callable = None + transform_func_sr_b: Callable = None + + if k_dim == 4: + transform_func_sr_a = shared_16x4_to_local_64x1_layout_A + transform_func_sr_b = shared_16x4_to_local_64x1_layout_A + elif k_dim == 16: + transform_func_sr_a = shared_16x16_to_local_64x4_layout_A + transform_func_sr_b = shared_16x16_to_local_64x4_layout_A + elif k_dim == 32: + transform_func_sr_a = shared_16x32_to_local_64x8_layout_A + transform_func_sr_b = shared_16x32_to_local_64x8_layout_A + elif k_dim == 64: + transform_func_sr_a = shared_16x64_to_local_64x16_layout_A + transform_func_sr_b = shared_16x64_to_local_64x16_layout_A + else: + raise ValueError("k_dim must be 4 or 16 or 32 or 64 currently") + + is_sr_conditions = [False] + is_sr_conditions.append(matrix == "A" and not transposed) + is_sr_conditions.append(matrix == "B" and transposed) + is_sr_axis_order = any(is_sr_conditions) + + micro_size_x, micro_size_y, micro_size_k = get_mma_micro_size(dtype) + + # the layout of mma.sync is row.col. 
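Cross-reference for `examples/online_softmax/online_softmax.py` above: the update `lse = max_x * scale + log2(exp2(lse - max_x * scale) + sum_exp_x)` is a streaming log-sum-exp in base 2, and the second pass then emits `exp2(x * scale - lse)`. A small PyTorch sketch of the same recurrence (chunk size and input are illustrative only, not the kernel's API):

import torch

def streaming_softmax(x: torch.Tensor, BN: int = 4) -> torch.Tensor:
    scale = 1.44269504                      # log2(e), same constant as in the kernel
    lse = torch.tensor(float("-inf"))       # running log2(sum(exp(x)))
    for chunk in x.split(BN):
        m = chunk.max() * scale
        s = torch.exp2(chunk * scale - m).sum()
        lse = m + torch.log2(torch.exp2(lse - m) + s)
    # Second pass: softmax(x) == exp2(x * log2(e) - lse)
    return torch.exp2(x * scale - lse)

x = torch.randn(16)
torch.testing.assert_close(streaming_softmax(x), torch.softmax(x, dim=0), atol=1e-5, rtol=1e-5)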
+ # so the b matrix expected a transposed basic layout + transform_func: Callable = None + if matrix == "A": + transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a(j, i) + micro_size_s, micro_size_r = micro_size_x, micro_size_k + elif matrix == "B": + transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b(j, i) + micro_size_s, micro_size_r = micro_size_k, micro_size_y + else: + raise ValueError(f"Unsupported matrix {matrix}") + + inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype=T.int32) + + def forward_thread(i: int, j: int) -> int: + """ + Given the row index `i` and column index `j` in the fragment, + """ + lane_id, _ = inverse_mma_load_layout.map_indices([i, j]) + return lane_id + + def forward_index(i: int, j: int) -> int: + """ + Given the row index `i` and column index `j` in the fragment, + """ + _, local_id = inverse_mma_load_layout.map_indices([i, j]) + return local_id + + base_fragment = T.Fragment( + [micro_size_s, micro_size_r] if is_sr_axis_order else [micro_size_r, micro_size_s], + forward_thread_fn=forward_thread, + forward_index_fn=forward_index, + ) + return base_fragment + + +block_rows = 2 +block_cols = 2 +warp_rows = 2 +warp_cols = 2 +chunk = 2 + +from tilelang.tools import plot_layout + +# ldmatrix layout 16x16 +base_layout = make_mfma_load_base_layout(dtype=T.float16, matrix="A", transposed=False) +print(base_layout) +plot_layout(base_layout, name="base_layout") + +# warp layout 32x32 +warp_layout = base_layout.repeat([warp_rows, warp_cols], repeat_on_thread=False, lower_dim_first=False) +print(warp_layout) +plot_layout(warp_layout, name="warp_layout") + +# block layout 64x32 +block_layout = warp_layout.repeat([block_rows, 1], repeat_on_thread=True, lower_dim_first=True).replicate(block_cols) +print(block_layout) +plot_layout(block_layout, name="block_layout") diff --git a/examples/plot_layout/fragment_mma_load_a.py b/examples/plot_layout/fragment_mma_load_a.py index 988899448..df4a0b887 100644 --- a/examples/plot_layout/fragment_mma_load_a.py +++ b/examples/plot_layout/fragment_mma_load_a.py @@ -5,9 +5,7 @@ from tilelang.intrinsics.utils import get_mma_micro_size -def make_mma_load_base_layout(dtype: str = "float16", - matrix: Literal["A", "B"] = "A", - transposed: bool = False) -> T.Fragment: +def make_mma_load_base_layout(dtype: T.dtype = T.float16, matrix: Literal["A", "B"] = "A", transposed: bool = False) -> T.Fragment: """ Create a layout function for storing MMA results into a fragment buffer. 
This layout is used in conjunction with `inverse_mma_store_layout` to @@ -36,6 +34,7 @@ def make_mma_load_base_layout(dtype: str = "float16", shared_16x16_to_mma_32x8_layout_sr_b, shared_16x32_to_mma_32x16_layout_sr_b, ) + assert matrix in ["A", "B"], "matrix should be either A or B" dtype_bits = DataType(dtype).bits # s represents spatial axis @@ -67,17 +66,15 @@ def make_mma_load_base_layout(dtype: str = "float16", # so the b matrix expected a transposed basic layout transform_func: Callable = None if matrix == "A": - transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a( - j, i) + transform_func = transform_func_sr_a if is_sr_axis_order else lambda i, j: transform_func_sr_a(j, i) micro_size_s, micro_size_r = micro_size_x, micro_size_k elif matrix == "B": - transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b( - j, i) + transform_func = transform_func_sr_b if is_sr_axis_order else lambda i, j: transform_func_sr_b(j, i) micro_size_s, micro_size_r = micro_size_k, micro_size_y else: raise ValueError(f"Unsupported matrix {matrix}") - inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype="int32") + inverse_mma_load_layout = IndexMap.from_func(transform_func, index_dtype=T.int32) def forward_thread(i: int, j: int) -> int: """ @@ -110,7 +107,7 @@ def forward_index(i: int, j: int) -> int: from tilelang.tools import plot_layout # ldmatrix layout 16x16 -base_layout = make_mma_load_base_layout(dtype="float16", matrix="A", transposed=False) +base_layout = make_mma_load_base_layout(dtype=T.float16, matrix="A", transposed=False) print(base_layout) plot_layout(base_layout, name="base_layout") diff --git a/examples/quickstart.py b/examples/quickstart.py index 42514ee39..e99fc0dbc 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -6,13 +6,12 @@ # target currently can be "cuda" or "hip" or "cpu". # if not specified, it will be inferred from the input tensors during compile time @tilelang.jit -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def matmul_relu_kernel( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): @@ -55,10 +54,9 @@ def matmul_relu_kernel( block_N = 128 block_K = 32 -# 1. Define the kernel (matmul) and compile/lower it into an executable module +# Define the kernel (matmul) and compile/lower it into an executable module matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K) - -# 3. Test the kernel in Python with PyTorch data +# Test the kernel in Python with PyTorch data import torch # Create random input tensors on the GPU @@ -78,7 +76,7 @@ def matmul_relu_kernel( print("Kernel output matches PyTorch reference.") # 4. 
Retrieve and inspect the generated CUDA source (optional) -# cuda_source = jit_kernel.get_kernel_source() +# cuda_source = matmul_relu_kernel.get_kernel_source() # print("Generated CUDA kernel:\n", cuda_source) # 5.Profile latency with kernel diff --git a/examples/rand/rand_uint.py b/examples/rand/rand_uint.py new file mode 100644 index 000000000..466a51b7a --- /dev/null +++ b/examples/rand/rand_uint.py @@ -0,0 +1,57 @@ +import tilelang +import tilelang.language as T +import torch +import triton +import triton.language as tl + + +@tilelang.jit +def tilelang_rand_1d(M=1024, seed=42): + num_per_thread = 128 + threads = 1 + blk_M = num_per_thread * threads + + @T.prim_func + def rand_kernel(A: T.Tensor((M,), "uint32")): + with T.Kernel(T.ceildiv(M, threads * num_per_thread), threads=threads) as bx: + tx = T.get_thread_binding() + T.rng_init(seed, 0, bx * blk_M + tx * num_per_thread) + for i, j in T.Parallel(threads, num_per_thread): + offsets = (bx * threads + i) * num_per_thread + idx = offsets + j + if idx < M: + A[idx] = T.rng_rand() + + return rand_kernel + + +@triton.jit +def triton_rand_1d(X, M, elements_per_thread, seed): + pid = tl.program_id(0) + offset = pid * elements_per_thread + tl.arange(0, elements_per_thread) + + r0, r1, r2, r3 = tl.randint4x(seed, offset) + + base_idx = offset * 4 + tl.store(X + base_idx, r0, mask=base_idx < M) + tl.store(X + base_idx + 1, r1, mask=(base_idx + 1) < M) + tl.store(X + base_idx + 2, r2, mask=(base_idx + 2) < M) + tl.store(X + base_idx + 3, r3, mask=(base_idx + 3) < M) + + +def test_rand_1d(M, seed): + kernel = tilelang_rand_1d(M, seed) + tilelang_result = torch.empty(M, dtype=torch.uint32, device="cuda") + kernel(tilelang_result) + + triton_result = torch.empty(M, dtype=torch.uint32, device="cuda") + grid = (triton.cdiv(M, 128),) + triton_rand_1d[grid](triton_result, tl.constexpr(M), tl.constexpr(128 // 4), seed) + + torch.testing.assert_close(tilelang_result, triton_result) + + +if __name__ == "__main__": + test_rand_1d(1024, 42) + test_rand_1d(512, 123) + test_rand_1d(128, 0) diff --git a/examples/seer_attention/block_sparse_attn_tilelang.py b/examples/seer_attention/block_sparse_attn_tilelang.py index dcd581c6b..0a3c3a6e3 100644 --- a/examples/seer_attention/block_sparse_attn_tilelang.py +++ b/examples/seer_attention/block_sparse_attn_tilelang.py @@ -10,10 +10,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -30,70 +27,33 @@ def get_sparse_attn_mask_from_threshold(x, threshold, use_dense_for_last_block=F @tilelang.jit( - out_idx=[4], pass_configs={ + out_idx=[4], + pass_configs={ tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }) + }, +) def blocksparse_flashattn(batch, heads, seq_q, seq_kv, dim, downsample_len, is_causal): block_M = 64 block_N = 64 num_stages = 0 threads = 128 - scale = (1.0 / dim)**0.5 * 1.44269504 # log2(e) + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) q_shape = [batch, heads, seq_q, dim] kv_shape = [batch, heads, seq_kv, dim] block_mask_shape = [batch, heads, downsample_len, downsample_len] - dtype = "float16" - 
accum_dtype = "float" - block_mask_dtype = "int8" + dtype = T.float16 + accum_dtype = T.float32 + block_mask_dtype = T.int8 def kernel_func(block_M, block_N, num_stages, threads): - - @T.macro - def Softmax( - acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), - acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), - scores_max: T.FragmentBuffer([block_M], accum_dtype), - scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - scores_sum: T.FragmentBuffer([block_M], accum_dtype), - logsum: T.FragmentBuffer([block_M], accum_dtype), - ): - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # To do causal softmax, we need to set the scores_max to 0 if it is -inf - # This process is called Check_inf in FlashAttention3 code, and it only need to be done - # in the first ceil_div(kBlockM, kBlockN) steps. - # for i in T.Parallel(block_M): - # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, block_N): - # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - - # max * log_2(e)) This allows the compiler to use the ffma - # instruction instead of fadd and fmul separately. - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - - @T.macro - def Rescale( - acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), - scores_scale: T.FragmentBuffer([block_M], accum_dtype), - ): - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - @T.prim_func def main( - Q: T.Tensor(q_shape, dtype), - K: T.Tensor(kv_shape, dtype), - V: T.Tensor(kv_shape, dtype), - BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), - Output: T.Tensor(q_shape, dtype), + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + BlockSparseMask: T.Tensor(block_mask_shape, block_mask_dtype), + Output: T.Tensor(q_shape, dtype), ): with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -108,47 +68,61 @@ def main( scores_scale = T.alloc_fragment([block_M], accum_dtype) scores_sum = T.alloc_fragment([block_M], accum_dtype) logsum = T.alloc_fragment([block_M], accum_dtype) - block_mask = T.alloc_local([downsample_len], block_mask_dtype) + block_mask = T.alloc_fragment([downsample_len], block_mask_dtype) - T.copy(Q[bz, by, bx * block_M:(bx + 1) * block_M, :], Q_shared) + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) T.fill(acc_o, 0) T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - for vj in T.serial(downsample_len): - block_mask[vj] = BlockSparseMask[bz, by, bx, vj] + T.copy(BlockSparseMask[bz, by, bx, :], block_mask) loop_range = T.ceildiv(seq_kv, block_N) for k in T.Pipelined(loop_range, num_stages=num_stages): if block_mask[k] != 0: - T.copy(K[bz, by, k * block_N:(k + 1) * block_N, :], K_shared) + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) if is_causal: past_len = seq_kv - seq_q for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else( - bx * block_M + i + past_len >= k * block_N + j, 0, - 
-T.infinity(acc_s.dtype)) + acc_s[i, j] = T.if_then_else(bx * block_M + i + past_len >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) else: T.clear(acc_s) - T.gemm( - Q_shared, - K_shared, - acc_s, - transpose_B=True, - policy=T.GemmWarpPolicy.FullRow) - - Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, - scores_sum, logsum) - Rescale(acc_o, scores_scale) - T.copy(V[bz, by, k * block_N:(k + 1) * block_N, :], V_shared) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # in the first ceil_div(kBlockM, kBlockN) steps. + # for i in T.Parallel(block_M): + # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_M, dim): acc_o[i, j] /= logsum[i] T.copy(acc_o, O_shared) - T.copy(O_shared, Output[bz, by, bx * block_M:(bx + 1) * block_M, :]) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) return main @@ -163,44 +137,40 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) downsample_factor = BLOCK downsample_len = math.ceil(SEQ_LEN / downsample_factor) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.float16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.float16) x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) # Run tilelang kernel - kernel = blocksparse_flashattn( - BATCH, N_HEADS, SEQ_LEN, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + kernel = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) tilelang_output = kernel(q, k, v, block_mask.to(torch.int8)) # Compute reference # Expand block mask to 
full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) # Verify accuracy - assert torch.allclose(tilelang_output, ref_output, atol=1e-2, rtol=1e-2), \ - "TileLang output doesn't match reference" + assert torch.allclose(tilelang_output, ref_output, atol=1e-2, rtol=1e-2), "TileLang output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -213,42 +183,40 @@ def test_topk_sparse_attention_qlen_lt_klen(): torch.manual_seed(0) # Create inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.float16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.float16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.float16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.float16) sm_scale = 1.0 / (D_HEAD**0.5) downsample_factor = BLOCK downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.float16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.float16) # Force the first column to be high so that the first block is always selected. 
x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) - kernel = blocksparse_flashattn( - BATCH, N_HEADS, Q_LEN, K_LEN, D_HEAD, downsample_len, is_causal=True) + kernel = blocksparse_flashattn(BATCH, N_HEADS, Q_LEN, K_LEN, D_HEAD, downsample_len, is_causal=True) print(kernel.get_kernel_source()) tilelang_output = kernel(q, k, v, block_mask.to(torch.int8)) past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) print("ref_output", ref_output) print("tilelang_output", tilelang_output) @@ -264,5 +232,56 @@ def main(): test_topk_sparse_attention_qlen_lt_klen() +def run_regression_perf(): + BATCH, N_HEADS, SEQ_LEN, D_HEAD = 4, 2, 256, 64 + TOPK = 2 + BLOCK = 64 + torch.manual_seed(0) + + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.float16) + + downsample_factor = BLOCK + downsample_len = math.ceil(SEQ_LEN / downsample_factor) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.float16) + x_ds[:, :, :, 0] = 100 + block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) + + kernel = blocksparse_flashattn(BATCH, N_HEADS, SEQ_LEN, SEQ_LEN, D_HEAD, downsample_len, is_causal=True) + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(q, k, v, block_mask.to(torch.int8)) + + latency_1 = do_bench(run_kernel_only, backend="cupti") + + BATCH, N_HEADS = 1, 1 + Q_LEN, K_LEN, D_HEAD = 128, 256, 64 + TOPK = 1 + BLOCK = 64 + torch.manual_seed(0) + + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.float16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.float16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.float16) + + downsample_factor = BLOCK + downsample_len = math.ceil(K_LEN / downsample_factor) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.float16) + x_ds[:, :, :, 0] = 100 + block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) + + kernel = blocksparse_flashattn(BATCH, N_HEADS, Q_LEN, K_LEN, D_HEAD, downsample_len, is_causal=True) + print(kernel.get_kernel_source()) + + def run_kernel_only2(): + kernel(q, k, v, block_mask.to(torch.int8)) + + latency_2 = do_bench(run_kernel_only2, backend="cupti") + + return (latency_1 + latency_2) / 2 + + if 
__name__ == "__main__": main() diff --git a/examples/seer_attention/block_sparse_attn_triton.py b/examples/seer_attention/block_sparse_attn_triton.py index ed33cc1e2..b4cc3cd00 100644 --- a/examples/seer_attention/block_sparse_attn_triton.py +++ b/examples/seer_attention/block_sparse_attn_triton.py @@ -15,10 +15,7 @@ def get_sparse_attn_mask_from_topk(x, topk, use_dense_for_last_block=False): bsz, num_head, downsample_len, _ = x.shape # N_CTX = downsample_len * BLOCK sparse_index = torch.topk(x, topk, dim=-1).indices - dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], - False, - dtype=torch.bool, - device=x.device) + dense_mask = torch.full([bsz, num_head, downsample_len, downsample_len], False, dtype=torch.bool, device=x.device) dense_mask.scatter_(-1, sparse_index, True) if use_dense_for_last_block: dense_mask[:, :, -2:, :] = True @@ -54,7 +51,6 @@ def _fwd_kernel_inner( BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, ): - mask_val = tl.load(block_mask_ptr + k_block_col_idx * stride_bmask_n) if mask_val == True: @@ -69,7 +65,7 @@ def _fwd_kernel_inner( qk *= sm_scale # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N - qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float('-inf')) + qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float("-inf")) m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_ij[:, None] @@ -149,7 +145,7 @@ def _fwd_kernel( v_ptrs = V + off_v mask_ptrs = block_mask_ptr + start_m * stride_bmm - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf') + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) @@ -185,24 +181,12 @@ def _fwd_kernel( acc = acc * l_recip acc = acc.to(Out.dtype.element_ty) - off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[ - None, :] * stride_od + off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < N_CTX) -def _forward(ctx, - q, - k, - v, - block_sparse_mask, - sm_scale, - BLOCK_M=64, - BLOCK_N=64, - num_warps=None, - num_stages=1, - out=None): - +def _forward(ctx, q, k, v, block_sparse_mask, sm_scale, BLOCK_M=64, BLOCK_N=64, num_warps=None, num_stages=1, out=None): assert q.shape[-1] == k.shape[-1] == v.shape[-1] assert k.shape[2] == v.shape[2] o = out if out is not None else torch.empty_like(q).contiguous() @@ -247,7 +231,6 @@ def _forward(ctx, class _sparse_attention(torch.autograd.Function): - @staticmethod def forward(ctx, q, k, v, block_sparse_dense, sm_scale): # shape constraints @@ -271,9 +254,9 @@ def test_topk_sparse_attention(): torch.manual_seed(0) # Create inputs - q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, SEQ_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) sm_scale = 1.0 / (D_HEAD**0.5) # Create sparse mask (downsampled to block level) @@ -281,9 +264,7 @@ def test_topk_sparse_attention(): downsample_len = math.ceil(SEQ_LEN / downsample_factor) 
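    # downsample_len is the number of BLOCK-sized tiles along the sequence axis; the
    # block-level mask built below is expanded back to element level with torch.kron,
    # e.g. torch.kron(torch.tensor([[1., 0.], [1., 1.]]), torch.ones(2, 2)) turns a
    # 2x2 block mask into a 4x4 mask where each entry becomes a 2x2 patch.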
print("downsample_len", downsample_len) - x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], - device='cuda', - dtype=torch.bfloat16) + x_ds = torch.randn([BATCH, N_HEADS, downsample_len, downsample_len], device="cuda", dtype=torch.bfloat16) x_ds[:, :, :, 0] = 100 print("x_ds.shape", x_ds.shape) block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -295,22 +276,21 @@ def test_topk_sparse_attention(): # Compute reference # Expand block mask to full attention matrix - full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')) + full_mask = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")) full_mask = full_mask[..., :SEQ_LEN, :SEQ_LEN].bool() full_mask = full_mask & torch.tril(torch.ones_like(full_mask)) # Apply causal # PyTorch reference implementation - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale - attn = attn.masked_fill(~full_mask, float('-inf')) + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale + attn = attn.masked_fill(~full_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # print("ref_output", ref_output) # print("triton_output", triton_output) # Verify accuracy - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference" print("Pass topk sparse attention test with qlen == klen") @@ -322,16 +302,15 @@ def test_topk_sparse_attention_qlt_kl(): torch.manual_seed(0) # Create inputs. - q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) - v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device='cuda', dtype=torch.bfloat16) + q = torch.randn(BATCH, N_HEADS, Q_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + k = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) + v = torch.randn(BATCH, N_HEADS, K_LEN, D_HEAD, device="cuda", dtype=torch.bfloat16) # softmax scale sm_scale = 1.0 / (D_HEAD**0.5) downsample_factor = BLOCK downsample_len = math.ceil(K_LEN / downsample_factor) # number of blocks along one dimension - x_ds = torch.randn( - BATCH, N_HEADS, downsample_len, downsample_len, device='cuda', dtype=torch.bfloat16) + x_ds = torch.randn(BATCH, N_HEADS, downsample_len, downsample_len, device="cuda", dtype=torch.bfloat16) # Force the first column to be high so that the first block is always selected. 
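    # Keeping block column 0 active also guarantees that, after the causal mask is
    # applied, every query row still has at least one unmasked key (position 0), so
    # the reference softmax never reduces over an all -inf row.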
x_ds[:, :, :, 0] = 100 block_mask = get_sparse_attn_mask_from_topk(x_ds, topk=TOPK) @@ -340,26 +319,25 @@ def test_topk_sparse_attention_qlt_kl(): past_len = K_LEN - Q_LEN - attn = torch.einsum('bhsd,bhtd->bhst', q, k) * sm_scale + attn = torch.einsum("bhsd,bhtd->bhst", q, k) * sm_scale - full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device='cuda')).bool() + full_mask_full = torch.kron(block_mask.float(), torch.ones(BLOCK, BLOCK, device="cuda")).bool() full_mask_full = full_mask_full[..., :K_LEN, :K_LEN] effective_mask = full_mask_full[..., past_len:K_LEN, :] # shape: (B, H, Q_LEN, K_LEN) i_global = torch.arange(past_len, K_LEN, device=k.device).unsqueeze(1) # shape: (Q_LEN, 1) j_global = torch.arange(K_LEN, device=k.device).unsqueeze(0) # shape: (1, K_LEN) - causal_mask = (j_global <= i_global) # shape: (Q_LEN, K_LEN) + causal_mask = j_global <= i_global # shape: (Q_LEN, K_LEN) final_mask = effective_mask & causal_mask # shape: (B, H, Q_LEN, K_LEN) - attn = attn.masked_fill(~final_mask, float('-inf')) + attn = attn.masked_fill(~final_mask, float("-inf")) attn = F.softmax(attn, dim=-1) - ref_output = torch.einsum('bhst,bhtd->bhsd', attn, v) + ref_output = torch.einsum("bhst,bhtd->bhsd", attn, v) # Verify accuracy. - assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), \ - "Triton output doesn't match reference when qlen < klen" + assert torch.allclose(triton_output, ref_output, atol=1e-2, rtol=1e-2), "Triton output doesn't match reference when qlen < klen" print("Pass topk sparse attention test with qlen < klen") diff --git a/examples/seer_attention/regression_block_sparse_attn_tilelang.py b/examples/seer_attention/regression_block_sparse_attn_tilelang.py new file mode 100644 index 000000000..86d7b3b28 --- /dev/null +++ b/examples/seer_attention/regression_block_sparse_attn_tilelang.py @@ -0,0 +1,10 @@ +import tilelang.testing +import block_sparse_attn_tilelang + + +def regression_block_sparse_attn_tilelang(): + tilelang.testing.process_func(block_sparse_attn_tilelang.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/sparse_tensorcore/regression_example_sparse_tensorcore.py b/examples/sparse_tensorcore/regression_example_sparse_tensorcore.py new file mode 100644 index 000000000..1167c1603 --- /dev/null +++ b/examples/sparse_tensorcore/regression_example_sparse_tensorcore.py @@ -0,0 +1,11 @@ +import tilelang.testing +import tilelang +import tilelang_example_sparse_tensorcore + + +def regression_example_sparse_tensorcore(): + tilelang.testing.process_func(tilelang_example_sparse_tensorcore.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py b/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py index 59c79c283..f33832aff 100644 --- a/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py +++ b/examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py @@ -1,7 +1,8 @@ import torch import tilelang from tilelang.utils.sparse import compress_sm90 -from tilelang.layout import make_metadata_layout +from tilelang.layout import make_cutlass_metadata_layout +from tilelang import language as T import tilelang.testing @@ -24,32 +25,24 @@ def matmul_sp( A_shared_shape = (block_M, block_K // 2) B_shared_shape = (block_K, block_N) - import tilelang.language as T - @T.prim_func def main( - A_sparse: T.Tensor(A_sparse_shape, in_dtype), - E: T.Tensor((M, K // 8), 
'uint8'), - B: T.Tensor(B_shape, in_dtype), - C: T.Tensor((M, N), out_dtype), + A_sparse: T.Tensor(A_sparse_shape, in_dtype), + E: T.Tensor((M, K // 8), "uint8"), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), ): with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, in_dtype) - E_shared = T.alloc_shared((block_M, block_K // 8), 'uint8') + E_shared = T.alloc_shared((block_M, block_K // 8), "uint8") C_local = T.alloc_fragment((block_M, block_N), accum_dtype) - T.annotate_layout({ - E: - make_metadata_layout( - E, mma_dtype="float16", arch="9.0", backend="cutlass", block_k=block_K), - E_shared: - make_metadata_layout( - E_shared, - mma_dtype="float16", - arch="9.0", - backend="cutlass", - block_k=block_K), - }) + T.annotate_layout( + { + E: make_cutlass_metadata_layout(E, mma_dtype=T.float16, arch="9.0", block_k=block_K), + E_shared: make_cutlass_metadata_layout(E_shared, mma_dtype=T.float16, arch="9.0", block_k=block_K), + } + ) T.clear(C_local) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(E[by * block_M, k * block_K // 8], E_shared) @@ -61,7 +54,7 @@ def main( return main -def generate_2_to_4_sparse_tensor(shape, dtype=torch.float32, device='cpu'): +def generate_2_to_4_sparse_tensor(shape, dtype=torch.float32, device="cpu"): if shape[-1] % 4 != 0: raise ValueError("Last dimension must be divisible by 4 for 2:4 sparsity.") @@ -106,9 +99,9 @@ def run_gemm_sp( num_threads, ) - A = generate_2_to_4_sparse_tensor((M, K), dtype=torch.float16, device='cuda') + A = generate_2_to_4_sparse_tensor((M, K), dtype=torch.float16, device="cuda") A_sparse, E = compress_sm90(A, block_k=block_K, transposed=False) - B = torch.randn((K, N), device='cuda', dtype=torch.float16) + B = torch.randn((K, N), device="cuda", dtype=torch.float16) C_sp = kernel(A_sparse, E, B).half() C = torch.matmul(A, B) @@ -117,7 +110,46 @@ def run_gemm_sp( def main(): - run_gemm_sp(512, 1024, 768, "float16", "float16", "float32", 128, 128, 128, 2, 128) + run_gemm_sp(512, 1024, 768, T.float16, T.float16, T.float32, 128, 128, 128, 2, 128) + + +def run_regression_perf(): + M, N, K, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages, num_threads = ( + 512, + 1024, + 768, + 128, + 128, + 128, + "float16", + "float16", + "float32", + 2, + 128, + ) + kernel = matmul_sp( + M, + N, + K, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + num_threads, + ) + A = generate_2_to_4_sparse_tensor((M, K), dtype=torch.float16, device="cuda") + A_sparse, E = compress_sm90(A, block_k=block_K, transposed=False) + B = torch.randn((K, N), device="cuda", dtype=torch.float16) + + from tilelang.profiler import do_bench + + def run_kernel_only(): + kernel(A_sparse, E, B) + + return do_bench(run_kernel_only, backend="cupti") if __name__ == "__main__": diff --git a/examples/topk/example_topk.py b/examples/topk/example_topk.py index 0ca19fb18..ed5ba0d4a 100644 --- a/examples/topk/example_topk.py +++ b/examples/topk/example_topk.py @@ -22,19 +22,19 @@ def tl_topk( blk_m, threads=128, ): - dtype = "float32" + dtype = T.float32 @T.prim_func def topk_kernel( - logits: T.Tensor([M, N], dtype), - topk_gates: T.Tensor([M, topk], dtype), - topk_indices: T.Tensor([M, topk], "int32"), + logits: T.Tensor([M, N], dtype), + topk_gates: T.Tensor([M, topk], dtype), + topk_indices: T.Tensor([M, topk], T.int32), ): with T.Kernel(T.ceildiv(M, 
blk_m), threads=threads) as bx: logits_frag = T.alloc_fragment([blk_m, N], dtype=dtype) max_val = T.alloc_fragment([blk_m], dtype=dtype) - expand_max_idx = T.alloc_fragment([blk_m, N], "int32") - max_idx = T.alloc_fragment([blk_m], "int32") + expand_max_idx = T.alloc_fragment([blk_m, N], T.int32) + max_idx = T.alloc_fragment([blk_m], T.int32) T.copy(logits[bx * blk_m, 0], logits_frag) @@ -43,15 +43,12 @@ def topk_kernel( T.reduce_max(logits_frag, max_val, dim=1, clear=True) for i, j in T.Parallel(blk_m, N): - expand_max_idx[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], j, - expand_max_idx[i, j]) + expand_max_idx[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], j, expand_max_idx[i, j]) T.reduce_max(expand_max_idx, max_idx, dim=1, clear=True) for i, j in T.Parallel(blk_m, N): - - logits_frag[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], -10000.0, - logits_frag[i, j]) + logits_frag[i, j] = T.if_then_else(max_val[i] == logits_frag[i, j], -10000.0, logits_frag[i, j]) for i in T.Parallel(blk_m): topk_gates[bx * blk_m + i, k] = max_val[i] @@ -61,7 +58,6 @@ def topk_kernel( def ref_program(logits, top_k): - top_k_gates, top_k_indices = logits.topk(top_k, dim=1) return top_k_gates, top_k_indices.to(torch.int32) @@ -93,5 +89,29 @@ def main(argv=None): print(f"Tilelang latency: {tilelang_latency}") +def run_regression_perf(argv=None): + parser = argparse.ArgumentParser() + parser.add_argument("--M", type=int, default=320, help="num_tokens") + parser.add_argument("--N", type=int, default=128, help="num_experts") + parser.add_argument("--topk", type=int, default=6, help="topk") + parser.add_argument("--blk_m", type=int, default=64, help="blk_m") + # In benchmark mode, ignore process-wide sys.argv unless an explicit argv is provided. + args = parser.parse_args(argv or []) + M, N, topk, blk_m = args.M, args.N, args.topk, args.blk_m + + logits = torch.rand((M, N), device="cuda", dtype=torch.float32) + + kernel = tl_topk(M=M, N=N, topk=topk, blk_m=blk_m) + tl_gates, tl_indices = kernel(logits) + + torch_gates, torch_indices = ref_program(logits, topk) + + torch.testing.assert_close(tl_gates, torch_gates) + torch.testing.assert_close(tl_indices, torch_indices) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Auto) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/topk/regression_topk_tilelang.py b/examples/topk/regression_topk_tilelang.py new file mode 100644 index 000000000..f59d866e8 --- /dev/null +++ b/examples/topk/regression_topk_tilelang.py @@ -0,0 +1,10 @@ +import tilelang.testing +import example_topk + + +def regression_example_topk(): + tilelang.testing.process_func(example_topk.run_regression_perf) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/examples/visual_layout_inference/visual_layout_inference.py b/examples/visual_layout_inference/visual_layout_inference.py new file mode 100644 index 000000000..8fa1eaf85 --- /dev/null +++ b/examples/visual_layout_inference/visual_layout_inference.py @@ -0,0 +1,61 @@ +import tilelang +import tilelang.language as T + + +# use pass_configs to enable layout visualization +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_ENABLE: True, + tilelang.PassConfigKey.TL_LAYOUT_VISUALIZATION_FORMATS: "svg", + }, +) +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): + @T.prim_func + def gemm( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), 
dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + + T.copy(C_local, C[by * block_M, bx * block_N]) + + return gemm + + +def main(): + kernel = matmul(128, 128, 128, 32, 32, 32) + + import torch + + a = torch.randn(128, 128).cuda().half() + b = torch.randn(128, 128).cuda().half() + + c = kernel(a, b) + + ref_c = a @ b + + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("All check passed.") + + # print the layout visualization result and save figures to ./tmp. + """ + C_local inferenced layout: + Shape: [32, 32] -> [8] + Thread: _j // 16 * 64 + _i // 16 * 32 + _i % 8 * 4 + _j % 8 // 2 + Index: [_j % 16 // 8 * 4 + _i % 16 // 8 * 2 + _j % 2] + """ + + +if __name__ == "__main__": + main() diff --git a/examples/warp_specialize/example_warp_specialize_flashmla.py b/examples/warp_specialize/example_warp_specialize_flashmla.py index 4a8f41ee4..155a45970 100644 --- a/examples/warp_specialize/example_warp_specialize_flashmla.py +++ b/examples/warp_specialize/example_warp_specialize_flashmla.py @@ -9,21 +9,23 @@ @tilelang.jit(out_idx=[6]) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split): - scale = (1.0 / (dim + pe_dim))**0.5 * 1.44269504 # log2(e) - dtype = "float16" - accum_dtype = "float" + scale = (1.0 / (dim + pe_dim)) ** 0.5 * 1.44269504 # log2(e) + dtype = T.float16 + accum_dtype = T.float32 kv_group_num = heads // kv_head_num VALID_BLOCK_H = min(block_H, kv_group_num) assert kv_head_num == 1, "kv_head_num must be 1" h_dim = dim // 2 - @T.macro - def flash_attn( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), + @T.prim_func + def main_no_split( + Q: T.Tensor([batch, heads, dim], dtype), + Q_pe: T.Tensor([batch, heads, pe_dim], dtype), + KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), + K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), + glse: T.Tensor([batch, heads, num_split], dtype), + Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), + Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): # smem_sQ @@ -81,11 +83,6 @@ def flash_attn( cur_kv_head = hid // (kv_group_num // block_H) - T.annotate_layout({ - O_shared_l: tilelang.layout.make_swizzled_layout(O_shared_l), - O_shared_r: tilelang.layout.make_swizzled_layout(O_shared_r), - }) - # barriers_Q q_shared_ready_barrier = T.alloc_barrier(arrive_count=256) @@ -108,9 +105,9 @@ def flash_attn( tx = T.get_thread_binding() - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :h_dim], Q_shared_l) - T.copy(Q[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, h_dim:], Q_shared_r) - T.copy(Q_pe[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) + T.copy(Q[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :h_dim], Q_shared_l) + T.copy(Q[bid, hid * VALID_BLOCK_H 
: (hid + 1) * VALID_BLOCK_H, h_dim:], Q_shared_r) + T.copy(Q_pe[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :], Q_pe_shared) T.barrier_arrive(q_shared_ready_barrier) T.barrier_wait(q_shared_ready_barrier, 0) @@ -123,25 +120,18 @@ def flash_attn( T.fill(acc_o_l, 0) T.fill(logsum_0, 0) - T.copy(KV[bid, block_N:2 * block_N, cur_kv_head, :h_dim], KV_shared_1_l) + T.copy(KV[bid, block_N : 2 * block_N, cur_kv_head, :h_dim], KV_shared_1_l) T.barrier_arrive(kv_shared_1_l_is_ready) - T.copy(KV[bid, block_N:2 * block_N, cur_kv_head, h_dim:], KV_shared_1_r) + T.copy(KV[bid, block_N : 2 * block_N, cur_kv_head, h_dim:], KV_shared_1_r) T.barrier_arrive(kv_shared_1_r_is_ready) - T.copy(K_pe[bid, block_N:2 * block_N, cur_kv_head, :], K_pe_shared_1) + T.copy(K_pe[bid, block_N : 2 * block_N, cur_kv_head, :], K_pe_shared_1) T.barrier_arrive(kv_shared_1_pe_is_ready) for k in T.serial(loop_range): - T.barrier_wait(kv_shared_0_l_is_ready, k % 2) - T.gemm( - Q_shared_l, - KV_shared_0_l, - acc_s_0, - transpose_B=True, - clear_accum=True, - wg_wait=-1) + T.gemm(Q_shared_l, KV_shared_0_l, acc_s_0, transpose_B=True, clear_accum=True, wg_wait=-1) T.barrier_wait(kv_shared_0_r_is_ready, k % 2) T.gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True, wg_wait=-1) @@ -161,8 +151,7 @@ def flash_attn( for i, j in T.Parallel(block_H, block_N): acc_s_0[i, j] = T.exp2(acc_s_0[i, j] * scale - scores_max[i] * scale) for i in T.Parallel(block_H): - scores_scale_0[i] = T.exp2(scores_max_prev_0[i] * scale - - scores_max[i] * scale) + scores_scale_0[i] = T.exp2(scores_max_prev_0[i] * scale - scores_max[i] * scale) T.reduce_sum(acc_s_0, scores_sum_0, dim=1) @@ -182,9 +171,7 @@ def flash_attn( T.barrier_wait(scale_1_ready_barrier, k % 2) if k < loop_range - 1: - T.copy( - KV[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, - cur_kv_head, :h_dim], KV_shared_0_l) + T.copy(KV[bid, (2 * k + 2) * block_N : (2 * k + 3) * block_N, cur_kv_head, :h_dim], KV_shared_0_l) T.barrier_arrive(kv_shared_0_l_is_ready) # Step 11. @@ -204,15 +191,10 @@ def flash_attn( T.gemm(SP1_shared, KV_shared_1_l, acc_o_l) if k < loop_range - 1: - - T.copy( - KV[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, - cur_kv_head, :h_dim], KV_shared_1_l) + T.copy(KV[bid, (2 * k + 3) * block_N : (2 * k + 4) * block_N, cur_kv_head, :h_dim], KV_shared_1_l) T.barrier_arrive(kv_shared_1_l_is_ready) - T.copy( - K_pe[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, cur_kv_head, :], - K_pe_shared_1) + T.copy(K_pe[bid, (2 * k + 3) * block_N : (2 * k + 4) * block_N, cur_kv_head, :], K_pe_shared_1) T.barrier_arrive(kv_shared_1_pe_is_ready) T.copy(logsum_0, logsum) @@ -221,8 +203,7 @@ def flash_attn( for i, j in T.Parallel(block_H, h_dim): acc_o_l[i, j] /= logsum[i] T.copy(acc_o_l, O_shared_l) - T.copy(O_shared_l, Output[bid, - hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, :h_dim]) + T.copy(O_shared_l, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, :h_dim]) else: T.copy(Q_pe_shared, Q_pe_local_1) @@ -237,16 +218,9 @@ def flash_attn( T.barrier_arrive(kv_shared_0_pe_is_ready) for k in T.serial(loop_range): - # Step 2. 
T.barrier_wait(kv_shared_1_l_is_ready, k % 2) - T.gemm( - Q_shared_l, - KV_shared_1_l, - acc_s_1, - transpose_B=True, - clear_accum=True, - wg_wait=-1) + T.gemm(Q_shared_l, KV_shared_1_l, acc_s_1, transpose_B=True, clear_accum=True, wg_wait=-1) T.barrier_wait(kv_shared_1_r_is_ready, k % 2) T.gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True, wg_wait=-1) @@ -265,8 +239,7 @@ def flash_attn( T.copy(scores_max_1, scores_max) for i in T.Parallel(block_H): - scores_scale_1[i] = T.exp2(scores_max_prev_1[i] * scale - - scores_max[i] * scale) + scores_scale_1[i] = T.exp2(scores_max_prev_1[i] * scale - scores_max[i] * scale) # Step 8. for i, j in T.Parallel(block_H, block_N): @@ -279,8 +252,7 @@ def flash_attn( acc_o_r[i, j] = acc_o_r[i, j] * (scores_scale_0[i] * scores_scale_1[i]) for i in T.Parallel(block_H): - logsum_1[i] = logsum_1[i] * scores_scale_1[i] * scores_scale_0[ - i] + scores_sum_1[i] + logsum_1[i] = logsum_1[i] * scores_scale_1[i] * scores_scale_0[i] + scores_sum_1[i] T.barrier_arrive(scale_1_ready_barrier) @@ -291,9 +263,7 @@ def flash_attn( T.barrier_arrive(s_shared_ready_barrier) if k < loop_range - 1: - T.copy( - KV[bid, (2 * k + 3) * block_N:(2 * k + 4) * block_N, cur_kv_head, - h_dim:], KV_shared_1_r) + T.copy(KV[bid, (2 * k + 3) * block_N : (2 * k + 4) * block_N, cur_kv_head, h_dim:], KV_shared_1_r) T.barrier_arrive(kv_shared_1_r_is_ready) T.barrier_wait(p0_1_1_ready_barrier, k % 2) @@ -301,15 +271,10 @@ def flash_attn( T.gemm(SP0_shared, KV_shared_0_r, acc_o_r) if k < loop_range - 1: - - T.copy( - KV[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, cur_kv_head, - h_dim:], KV_shared_0_r) + T.copy(KV[bid, (2 * k + 2) * block_N : (2 * k + 3) * block_N, cur_kv_head, h_dim:], KV_shared_0_r) T.barrier_arrive(kv_shared_0_r_is_ready) - T.copy( - K_pe[bid, (2 * k + 2) * block_N:(2 * k + 3) * block_N, cur_kv_head, :], - K_pe_shared_0) + T.copy(K_pe[bid, (2 * k + 2) * block_N : (2 * k + 3) * block_N, cur_kv_head, :], K_pe_shared_0) T.barrier_arrive(kv_shared_0_pe_is_ready) T.barrier_wait(lse_0_ready_barrier, 0) @@ -319,20 +284,7 @@ def flash_attn( for i, j in T.Parallel(block_H, h_dim): acc_o_r[i, j] /= logsum[i] T.copy(acc_o_r, O_shared_r) - T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H:(hid + 1) * VALID_BLOCK_H, - h_dim:]) - - @T.prim_func - def main_no_split( - Q: T.Tensor([batch, heads, dim], dtype), - Q_pe: T.Tensor([batch, heads, pe_dim], dtype), - KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), - K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), - Output: T.Tensor([batch, heads, dim], dtype), - ): - flash_attn(Q, Q_pe, KV, K_pe, Output) + T.copy(O_shared_r, Output[bid, hid * VALID_BLOCK_H : (hid + 1) * VALID_BLOCK_H, h_dim:]) return main_no_split @@ -352,31 +304,24 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): dim = q.shape[-1] pe_dim = q_pe.shape[-1] num_head_groups = q.shape[1] // kv.shape[2] - scale = (dim + pe_dim)**0.5 - q = rearrange( - q, 'b (h g) d -> b g h d', g=num_head_groups) # [batch_size, num_head_groups, groups, dim] + scale = (dim + pe_dim) ** 0.5 + q = rearrange(q, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, dim] - q_pe = rearrange( - q_pe, 'b (h g) d -> b g h d', - g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] + q_pe = rearrange(q_pe, "b (h g) d -> b g h d", g=num_head_groups) # [batch_size, num_head_groups, groups, pe_dim] - kv = 
rearrange(kv, 'b n h d -> b h n d') # [batch_size, groups, seqlen_kv, dim] + kv = rearrange(kv, "b n h d -> b h n d") # [batch_size, groups, seqlen_kv, dim] - k_pe = rearrange(k_pe, 'b n h d -> b h n d') # [batch_size, num_head_groups, groups, pe_dim] + k_pe = rearrange(k_pe, "b n h d -> b h n d") # [batch_size, num_head_groups, groups, pe_dim] query = torch.concat([q, q_pe], dim=-1) key = torch.concat([kv, k_pe], dim=-1) - scores = einsum( - query, key, - 'b g h d, b h s d -> b g h s') # [batch_size, num_head_groups, groups, seqlen_kv] + scores = einsum(query, key, "b g h d, b h s d -> b g h s") # [batch_size, num_head_groups, groups, seqlen_kv] - attention = F.softmax( - scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] + attention = F.softmax(scores / scale, dim=-1) # [batch_size, num_head_groups, groups, seqlen_kv] - out = einsum(attention, kv, - 'b g h s, b h s d -> b g h d') # [batch_size, num_head_groups, groups, dim] - out = rearrange(out, 'b g h d -> b (h g) d') # [batch_size, heads, dim] + out = einsum(attention, kv, "b g h s, b h s d -> b g h d") # [batch_size, num_head_groups, groups, dim] + out = rearrange(out, "b g h d -> b (h g) d") # [batch_size, heads, dim] return out @@ -399,12 +344,12 @@ def main(batch=1, heads=64, kv_heads=1, kv_ctx=1024, dim=512, pe_dim=64): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--batch', type=int, default=1, help='batch size') - parser.add_argument('--heads', type=int, default=128, help='q heads number') - parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') - parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') - parser.add_argument('--dim', type=int, default=512, help='head dim') - parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') + parser.add_argument("--batch", type=int, default=132, help="batch size") + parser.add_argument("--heads", type=int, default=128, help="q heads number") + parser.add_argument("--kv_heads", type=int, default=1, help="kv heads number") + parser.add_argument("--kv_ctx", type=int, default=8192, help="kv context length") + parser.add_argument("--dim", type=int, default=512, help="head dim") + parser.add_argument("--pe_dim", type=int, default=64, help="pe head dim") args = parser.parse_args() batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim main(batch, heads, kv_heads, kv_ctx, dim, pe_dim) diff --git a/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py b/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py index 3f552795e..1672dbfb8 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_barrierpipe_stage2.py @@ -1,12 +1,13 @@ import tilelang import tilelang.language as T +tilelang.disable_cache() + # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): num_stages = 2 mbarrier_list = [128, 128] * num_stages @@ -30,19 +31,13 @@ def main( for ko in range(T.ceildiv(K, block_K)): with T.ws(1): - T.mbarrier_wait_parity( - mbarrier=ko % num_stages + num_stages, - parity=((ko // num_stages) % num_stages) ^ 1) - T.copy(A[by * block_M:(by + 1) * 
block_M, ko * block_K:(ko + 1) * block_K], - A_shared[ko % num_stages, :, :]) - T.copy(B[ko * block_K:(ko + 1) * block_K, bx * block_N:(bx + 1) * block_N], - B_shared[ko % num_stages, :, :]) + T.mbarrier_wait_parity(mbarrier=ko % num_stages + num_stages, parity=((ko // num_stages) % num_stages) ^ 1) + T.copy(A[by * block_M : (by + 1) * block_M, ko * block_K : (ko + 1) * block_K], A_shared[ko % num_stages, :, :]) + T.copy(B[ko * block_K : (ko + 1) * block_K, bx * block_N : (bx + 1) * block_N], B_shared[ko % num_stages, :, :]) T.mbarrier_arrive(mbarrier=ko % num_stages) with T.ws(0): - T.mbarrier_wait_parity( - mbarrier=ko % num_stages, parity=(ko // num_stages) % num_stages) - T.gemm(A_shared[ko % num_stages, :, :], B_shared[ko % num_stages, :, :], - C_local) + T.mbarrier_wait_parity(mbarrier=ko % num_stages, parity=(ko // num_stages) % num_stages) + T.gemm(A_shared[ko % num_stages, :, :], B_shared[ko % num_stages, :, :], C_local) T.mbarrier_arrive(mbarrier=ko % num_stages + num_stages) with T.ws(0): @@ -52,11 +47,14 @@ def main( def main(M=16384, N=16384, K=16384): + tilelang.disable_cache() block_M = 128 block_N = 128 block_K = 64 jit_kernel = matmul(M, N, K, block_M, block_N, block_K) + print(jit_kernel.get_kernel_source()) + import torch a = torch.randn(M, K, device="cuda", dtype=torch.float16) @@ -84,5 +82,15 @@ def main(M=16384, N=16384, K=16384): print(f"Latency: {latency} ms") +def run_regression_perf(M=16384, N=16384, K=16384): + tilelang.disable_cache() + block_M = 128 + block_N = 128 + block_K = 64 + jit_kernel = matmul(M, N, K, block_M, block_N, block_K) + profiler = jit_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py b/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py index 9ba9f6816..b582ee74c 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_copy_0_gemm_1.py @@ -5,20 +5,12 @@ # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul_warp_specialize_copy_0_gemm_1(M, - N, - K, - block_M, - block_N, - block_K, - dtype="float16", - accum_dtype="float"): - +def matmul_warp_specialize_copy_0_gemm_1(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): @@ -82,5 +74,27 @@ def main(M=1024, N=1024, K=1024): print(f"Latency: {latency} ms") +def run_regression_perf(M=4096, N=4096, K=4096): + block_M = 128 + block_N = 128 + block_K = 64 + + jit_kernel = matmul_warp_specialize_copy_0_gemm_1(M, N, K, block_M, block_N, block_K) + + import torch + + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + + c = jit_kernel(a, b) + ref_c = a @ b + + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + + profiler = jit_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git 
a/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py b/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py index faaf48c64..d6d243bb0 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_copy_1_gemm_0.py @@ -5,20 +5,12 @@ # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul_warp_specialize_copy_1_gemm_0(M, - N, - K, - block_M, - block_N, - block_K, - dtype="float16", - accum_dtype="float"): - +def matmul_warp_specialize_copy_1_gemm_0(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=256) as (bx, by): @@ -83,5 +75,28 @@ def main(M=16384, N=16384, K=16384): print(f"Latency: {latency} ms") +def run_regression_perf(M=16384, N=16384, K=16384): + block_M = 128 + block_N = 128 + block_K = 64 + + jit_kernel = matmul_warp_specialize_copy_1_gemm_0(M, N, K, block_M, block_N, block_K) + + import torch + + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + + c = jit_kernel(a, b) + + ref_c = a @ b + + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + + profiler = jit_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py b/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py index c91274540..5468aa6ea 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_copy_gemm_0_1.py @@ -5,26 +5,20 @@ # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit( - out_idx=[2], pass_configs={ + out_idx=[2], + pass_configs={ tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - }) -def matmul_warp_specialize_copy_1_gemm_0(M, - N, - K, - block_M, - block_N, - block_K, - dtype="float16", - accum_dtype="float"): - + }, +) +def matmul_warp_specialize_copy_1_gemm_0(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): warp_group_num = 2 threads = 128 * warp_group_num @T.prim_func def main( - A: T.Tensor((M, K), dtype), - B: T.Tensor((K, N), dtype), - C: T.Tensor((M, N), dtype), + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), ): # Initialize Kernel Context with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): diff --git a/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py b/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py index 3b1d86719..54566b785 100644 --- a/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py +++ b/examples/warp_specialize/example_warp_specialize_gemm_softpipe_stage2.py @@ -5,8 +5,7 @@ # add decorator @tilelang.jit if you want to return a torch function # @tilelang.jit @tilelang.jit(out_idx=[2]) -def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"): - +def matmul(M, N, K, block_M, 
block_N, block_K, dtype=T.float16, accum_dtype=T.float32): @T.prim_func def main( A: T.Tensor[(M, K), dtype], @@ -79,5 +78,28 @@ def main(M=16384, N=16384, K=16384): print(f"Latency: {latency} ms") +def run_regression_perf(M=16384, N=16384, K=16384): + block_M = 128 + block_N = 128 + block_K = 64 + + jit_kernel = matmul(M, N, K, block_M, block_N, block_K) + + import torch + + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + + c = jit_kernel(a, b) + + ref_c = a @ b + + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + + profiler = jit_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + return profiler.do_bench(backend="cupti") + + if __name__ == "__main__": main() diff --git a/examples/warp_specialize/regression_example_warp_specialize.py b/examples/warp_specialize/regression_example_warp_specialize.py new file mode 100644 index 000000000..d5cd17d48 --- /dev/null +++ b/examples/warp_specialize/regression_example_warp_specialize.py @@ -0,0 +1,25 @@ +import tilelang.testing +import example_warp_specialize_gemm_barrierpipe_stage2 +import example_warp_specialize_gemm_copy_0_gemm_1 +import example_warp_specialize_gemm_copy_1_gemm_0 +import example_warp_specialize_gemm_softpipe_stage2 + + +def regression_example_warp_specialize_gemm_barrierpipe_stage2(): + tilelang.testing.process_func(example_warp_specialize_gemm_barrierpipe_stage2.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_warp_specialize_gemm_copy_0_gemm_1(): + tilelang.testing.process_func(example_warp_specialize_gemm_copy_0_gemm_1.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_warp_specialize_gemm_copy_1_gemm_0(): + tilelang.testing.process_func(example_warp_specialize_gemm_copy_1_gemm_0.run_regression_perf, M=1024, N=1024, K=1024) + + +def regression_example_warp_specialize_gemm_softpipe_stage2(): + tilelang.testing.process_func(example_warp_specialize_gemm_softpipe_stage2.run_regression_perf, M=1024, N=1024, K=1024) + + +if __name__ == "__main__": + tilelang.testing.regression() diff --git a/format.sh b/format.sh index 8f127433c..3cc4390db 100755 --- a/format.sh +++ b/format.sh @@ -9,7 +9,7 @@ # bash format.sh --all # # -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. +# Ruff (format) + Clang formatter (if installed). This script formats all changed files from the last mergebase. # You are encouraged to run this locally before pushing changes for review. # Cause the script to exit if a single command fails @@ -29,10 +29,7 @@ ALL_FILES='' ONLY_CHANGED='' FILES=() if (($# == 0)); then - if [[ -n "$(git status --porcelain --ignore-submodules --untracked-files=no)" ]]; then - echo "Detected uncommitted changes. Please commit or stash them before running $0." >&2 - exit 1 - fi + # Default: allow dirty workspace; run on changed files (committed + worktree) ONLY_CHANGED='true' else while (($# > 0)); do @@ -78,14 +75,17 @@ if [[ -n "${ALL_FILES}" ]]; then echo "Checking all files..." >&2 elif [[ -n "${ONLY_CHANGED}" ]]; then MERGE_BASE="$(get_merge_base)" - echo "Checking changed files compared to merge base (${MERGE_BASE})..." >&2 + echo "Checking changed files vs merge base (${MERGE_BASE}) and working tree..." >&2 elif [[ "${#FILES[@]}" -gt 0 ]]; then echo "Checking specified files: ${FILES[*]}..." >&2 fi +# Some systems set pip's default to --user, which breaks isolated virtualenvs. 
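+# PIP_USER mirrors pip's --user option, and environment variables take precedence over
+# pip config files, so exporting it as 0 keeps every pip invocation below out of the
+# user site-packages.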
+export PIP_USER=0 + # If pre-commit is not installed, install it. if ! python3 -m pre_commit --version &>/dev/null; then - python3 -m pip install pre-commit + python3 -m pip install pre-commit --user fi echo 'tile-lang pre-commit: Check Start' @@ -93,7 +93,17 @@ echo 'tile-lang pre-commit: Check Start' if [[ -n "${ALL_FILES}" ]]; then python3 -m pre_commit run --all-files elif [[ -n "${ONLY_CHANGED}" ]]; then - python3 -m pre_commit run --from-ref "${MERGE_BASE}" --to-ref HEAD + # Collect changed files (committed since merge-base + current worktree) + CHANGED_FILES="$(git diff --name-only --diff-filter=ACM "${MERGE_BASE}" 2>/dev/null || true)" + if [[ -n "${CHANGED_FILES}" ]]; then + echo "Running pre-commit on changed files:" + echo "${CHANGED_FILES}" + # Convert newline-separated files to space-separated and run pre-commit once + CHANGED_FILES_SPACE="$(echo "${CHANGED_FILES}" | tr '\n' ' ')" + python3 -m pre_commit run --files ${CHANGED_FILES_SPACE} + else + echo "No files changed relative to merge base and worktree. Skipping pre-commit." + fi elif [[ "${#FILES[@]}" -gt 0 ]]; then python3 -m pre_commit run --files "${FILES[@]}" fi @@ -105,7 +115,7 @@ echo 'tile-lang clang-tidy: Check Start' if [[ -x "$(command -v run-clang-tidy)" ]]; then # Check if clang-tidy is available if [[ ! -x "$(command -v clang-tidy)" ]]; then - python3 -m pip install --upgrade --requirements "${ROOT}/requirements-lint.txt" + python3 -m pip install --upgrade --requirements "${ROOT}/requirements-lint.txt" --user fi # Get clang-tidy version CLANG_TIDY_VERSION="$(clang-tidy --version | head -n1 | awk '{print $4}')" diff --git a/images/MatmulExample.svg b/images/MatmulExample.svg index 6e20daf55..294e8f631 100644 --- a/images/MatmulExample.svg +++ b/images/MatmulExample.svg @@ -1 +1 @@ -A_shared=T.alloc_shared((block_M,block_K))B_shared=T.alloc_shared((block_K,block_N))C_local=T.alloc_fragment((block_M,block_N),accum_dtype)importtilelang.languageasTdefMatmul(A:T.Buffer,B:T.Buffer,C:T.Buffer):withT.Kernel(ceildiv(N,block_N),ceildiv(M,block_M),threads=128)as(bx,by):T.clear(C_local)forkinT.Pipelined(ceildiv(K,block_K),num_stages=3):T.copy(A[by*block_M,k*block_K],A_shared)T.copy(B[k*block_K,bx*block_N],B_shared)T.gemm(A_shared,B_shared,C_local)Kernel Context InitializationBuffer AllocationRegisterInitialize Accumulate Buffer with ZeroMain Loop with Pipeline AnnotationT.copy(C_local,C[by*block_M,bx*block_N])Write Back to Global MemoryCopy Data from Global to Shared MemoryGEMMSharedMemoryGlobal MemoryShared MemoryRegister Files(a) Efficient GEMM with Multi-Level Tiling on GPUs(b) Describing Tiled GPU GEMM with TileLang \ No newline at end of file +A_shared=T.alloc_shared((block_M,block_K))B_shared=T.alloc_shared((block_K,block_N))C_local=T.alloc_fragment((block_M,block_N),accum_dtype)importtilelang.languageasTdefMatmul(A:T.Buffer,B:T.Buffer,C:T.Buffer):withT.Kernel(ceildiv(N,block_N),ceildiv(M,block_M),threads=128)as(bx,by):T.clear(C_local)forkinT.Pipelined(ceildiv(K,block_K),num_stages=3):T.copy(A[by*block_M,k*block_K],A_shared)T.copy(B[k*block_K,bx*block_N],B_shared)T.gemm(A_shared,B_shared,C_local)Kernel Context InitializationBuffer AllocationRegisterInitialize Accumulate Buffer with ZeroMain Loop with Pipeline AnnotationT.copy(C_local,C[by*block_M,bx*block_N])Write Back to Global MemoryCopy Data from Global to Shared MemoryGEMMSharedMemoryGlobal MemoryShared MemoryRegister Files(a) Efficient GEMM with Multi-Level Tiling on GPUs(b) Describing Tiled GPU GEMM with TileLang diff --git a/images/logo-row.svg 
b/images/logo-row.svg index 633243f3a..e73244b74 100644 --- a/images/logo-row.svg +++ b/images/logo-row.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/maint/gemm_v2/correctness_evaluation.py b/maint/gemm_v2/correctness_evaluation.py new file mode 100644 index 000000000..44441cdeb --- /dev/null +++ b/maint/gemm_v2/correctness_evaluation.py @@ -0,0 +1,739 @@ +# pytest correctness_evaluation.py -n 32 +import pytest +from tilelang import tvm as tvm +import tilelang.testing +from tilelang import language as T +import torch + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") + B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared.dyn") + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def _compile_and_check( + program, + trans_A, + trans_B, + in_dtype, + out_dtype, +): + kernel = tilelang.compile( + program, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + # tilelang.PassConfigKey.TIR_USE_ASYNC_COPY: False, + }, + ) + + print(kernel.get_kernel_source()) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + def ref_program(A, B): + if trans_A: + A = A.T + if trans_B: + B = B.T + if in_dtype == T.float32: + A = (A.view(torch.int32) - 0x1000).view(torch.float32) + B = (B.view(torch.int32) - 0x1000).view(torch.float32) + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(torch.__getattribute__(out_dtype)) + return C + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + print("assert_allclose") + + +def run_gemm( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=2, + num_threads=128, +): + if block_N >= 256 or block_M >= 256 or block_K >= 256: + num_stages = 0 + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +def matmul_rs( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, 
block_N) + A_frag_shape = A_shared_shape + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") + B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared.dyn") + A_frag = T.alloc_fragment(A_frag_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.copy(A_shared, A_frag) + T.gemm_v2(A_frag, B_shared, C_local, trans_A, trans_B) + # T.gemm(A_frag, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_rs( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=2, + num_threads=128, +): + if block_N >= 256 or block_M >= 256 or block_K >= 256: + num_stages = 0 + program = matmul_rs( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +def matmul_sr( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + B_frag_shape = B_shared_shape + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") + B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared.dyn") + B_frag = T.alloc_fragment(B_frag_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.copy(B_shared, B_frag) + T.gemm_v2(A_shared, B_frag, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_sr( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=2, + num_threads=128, +): + if block_N >= 256 or block_M >= 256 or block_K >= 256: + num_stages = 0 + program = matmul_sr( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +def matmul_rr( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + 
num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + A_frag_shape = A_shared_shape + B_frag_shape = B_shared_shape + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") + B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared.dyn") + A_frag = T.alloc_fragment(A_frag_shape, in_dtype) + B_frag = T.alloc_fragment(B_frag_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.copy(A_shared, A_frag) + T.copy(B_shared, B_frag) + T.gemm_v2(A_frag, B_frag, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_rr( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=2, + num_threads=128, +): + if block_N >= 256 or block_M >= 256 or block_K >= 256: + num_stages = 0 + program = matmul_rr( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +M_VALUES = [64, 128, 256] +N_VALUES = [16, 32, 64, 128, 256, 512] +K_VALUES = [16, 32, 64, 128] +K_VALUES_8Bit = [32, 64, 128] +FALSE_TRUE_CASES = ( + [ + pytest.param( + k, + T.float16, + T.float16, + T.float16, + id=f"K{k}-float16-float16-float16", + ) + for k in K_VALUES + ] + + [ + pytest.param( + k, + T.int8, + T.int32, + T.int32, + id="K32-int8-int32-int32", + ) + for k in K_VALUES_8Bit + ] + + [ + pytest.param( + k, + T.float8_e5m2, + T.float32, + T.float32, + id="K32-float8_e5m2-float32-float32", + ) + for k in K_VALUES_8Bit + ] + + [ + pytest.param( + k, + T.float8_e4m3fn, + T.float32, + T.float32, + id="K32-float8_e4m3-float32-float32", + ) + for k in K_VALUES_8Bit + ] +) + + +def _ensure_torch_dtypes(*dtype_names): + import torch + + for name in set(dtype_names): + if not hasattr(torch, name): + pytest.skip(f"Torch does not expose dtype {name}") + + +def run_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + run_gemm_rs(m, n, k * 3, False, True, in_dtype, out_dtype, accum_dtype, m, n, k) + + +def run_gemm_rs_false_false(m, n, k): + run_gemm_rs(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_rs_true_false(m, n, k): + run_gemm_rs(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_rs_true_true(m, n, k): + run_gemm_rs(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_sr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + run_gemm_sr(m, n, k * 3, False, True, in_dtype, out_dtype, accum_dtype, m, n, k) + + +def run_gemm_sr_false_false(m, n, k): + run_gemm_sr(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k) + + +def 
run_gemm_sr_true_false(m, n, k): + run_gemm_sr(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_sr_true_true(m, n, k): + run_gemm_sr(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_rr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + run_gemm_rr(m, n, k * 3, False, True, in_dtype, out_dtype, accum_dtype, m, n, k) + + +def run_gemm_rr_false_false(m, n, k): + run_gemm_rr(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_rr_true_false(m, n, k): + run_gemm_rr(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k) + + +def run_gemm_rr_true_true(m, n, k): + run_gemm_rr(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k) + + +TRANS_CASES = [ + pytest.param(False, False, id="nn"), + pytest.param(False, True, id="nt"), + pytest.param(True, False, id="tn"), + pytest.param(True, True, id="tt"), +] + + +@pytest.fixture(scope="module", autouse=True) +def _setup_tilelang_environment(): + tilelang.disable_cache() + tilelang.testing.set_random_seed(42) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + import torch + + required_torch_attrs = { + in_dtype, + out_dtype, + accum_dtype, + } + for attr in required_torch_attrs: + if not hasattr(torch, attr): + pytest.skip(f"Torch does not expose dtype {attr}") + run_gemm( + m, + n, + k * 3, + False, + True, + in_dtype, + out_dtype, + accum_dtype, + m, + n, + k, + ) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_false_false(m, n, k): + run_gemm( + m, + n, + k * 3, + False, + False, + T.float16, + T.float16, + T.float16, + m, + n, + k, + ) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_true_false(m, n, k): + run_gemm( + m, + n, + k * 3, + True, + False, + T.float16, + T.float16, + T.float16, + m, + n, + k, + ) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_true_true(m, n, k): + run_gemm( + m, + n, + k * 3, + True, + True, + T.float16, + T.float16, + T.float16, + m, + n, + k, + ) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + _ensure_torch_dtypes(in_dtype, out_dtype, accum_dtype) + run_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rs_false_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rs_false_false(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") 
+@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rs_true_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rs_true_false(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rs_true_true(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rs_true_true(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_sr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + _ensure_torch_dtypes(in_dtype, out_dtype, accum_dtype) + run_gemm_sr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_sr_false_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_sr_false_false(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_sr_true_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_sr_true_false(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_sr_true_true(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_sr_true_true(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_rr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + _ensure_torch_dtypes(in_dtype, out_dtype, accum_dtype) + run_gemm_rr_false_true(m, n, k, in_dtype, out_dtype, accum_dtype) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rr_false_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rr_false_false(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rr_true_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rr_true_false(m, n, k) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rr_true_true(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rr_true_true(m, n, k) + + +if __name__ == "__main__": + tilelang.testing.main() + + # # Test Pass + # for m in [64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k * 3, False, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m} {n} {k} Pass") + + # # Test Pass + # for m in [64, 128, 256]: + # for 
n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # print(f"======================= Test {m} {n} {k} False False =============================") + # run_gemm(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m} {n} {k} Pass") + + # # Test Pass + # for m in [64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # print(f"======================= Test {m} {n} {k} True False =============================") + # run_gemm(m, n, k * 3, True, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m}, {n} {k} Pass") + # print(f"Test {n} Pass") + + # # Test Pass + # for m in [64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # print(f"======================= Test {m} {n} {k} True True =============================") + # run_gemm(m, n, k * 3, True, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m}, {n} {k} Pass") + # print(f"Test {n} Pass") + + # Test Pass + # for m in [64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm_rs(m, n, k * 3, False, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m} {n} {k} Pass") + + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # run_gemm_rs(64, n, k, False, False, T.float16, T.float16, T.float16, 64, n, k, 0, 256) + # print(f"Test {64} {n} {k} Pass") + + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # run_gemm(64, n, k, False, False, T.float16, T.float16, T.float16, 64, n, k, 0, 256) + # print(f"Test {64} {n} {k} Pass") diff --git a/maint/gemm_v2/correctness_evaluation_sm70.py b/maint/gemm_v2/correctness_evaluation_sm70.py new file mode 100644 index 000000000..606d10261 --- /dev/null +++ b/maint/gemm_v2/correctness_evaluation_sm70.py @@ -0,0 +1,350 @@ +# pytest maint/gemm_v2/correctness_evaluation_sm70.py -n 32 +import pytest +from tilelang import tvm as tvm +import tilelang.testing +from tilelang import language as T + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") + B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared.dyn") + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + # T.gemm(A_shared, B_shared, C_local, trans_A, trans_B) + T.gemm_v2(A_shared, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def _compile_and_check( + program, + trans_A, + trans_B, + in_dtype, + out_dtype, +): + 
kernel = tilelang.compile( + program, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + # tilelang.PassConfigKey.TIR_USE_ASYNC_COPY: False, + }, + ) + + print(kernel.get_kernel_source()) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + def ref_program(A, B): + import torch + + if trans_A: + A = A.T + if trans_B: + B = B.T + if in_dtype == T.float32: + A = (A.view(torch.int32) - 0x1000).view(torch.float32) + B = (B.view(torch.int32) - 0x1000).view(torch.float32) + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(torch.__getattribute__(out_dtype)) + return C + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + print("assert_allclose") + + +def run_gemm( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128, +): + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +def matmul_rs( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + A_frag_shape = A_shared_shape + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope="shared.dyn") + B_shared = T.alloc_shared(B_shared_shape, in_dtype, scope="shared.dyn") + A_frag = T.alloc_fragment(A_frag_shape, in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.clear(C_local) + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + if trans_A: + T.copy(A[k * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, k * block_K], A_shared) + if trans_B: + T.copy(B[bx * block_N, k * block_K], B_shared) + else: + T.copy(B[k * block_K, bx * block_N], B_shared) + T.copy(A_shared, A_frag) + T.gemm_v2(A_frag, B_shared, C_local, trans_A, trans_B) + # T.gemm(A_frag, B_shared, C_local, trans_A, trans_B) + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def run_gemm_rs( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=3, + num_threads=128, +): + program = matmul_rs( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +M_VALUES = [64, 128] +N_VALUES = [32, 64, 128] +K_VALUES = [16, 32, 64] +FALSE_TRUE_CASES = [ + pytest.param( + k, + T.float16, + T.float16, + T.float16, + id=f"K{k}-float16-float16-float16", + ) + for k in K_VALUES +] + [ + pytest.param( + k, + T.float16, + T.float16, + T.float32, + id=f"K{k}-float16-float16-float32", + ) + for k in K_VALUES +] + + +def _ensure_torch_dtypes(*dtype_names): + import torch + + for name in set(dtype_names): + if not hasattr(torch, name): + 
pytest.skip(f"Torch does not expose dtype {name}") + + +def run_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + run_gemm_rs(m, n, k * 3, False, True, in_dtype, out_dtype, accum_dtype, m, n, k, 2, 128) + + +def run_gemm_rs_false_false(m, n, k): + run_gemm_rs(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) + + +TRANS_CASES = [ + pytest.param(False, False, id="nn"), + pytest.param(False, True, id="nt"), + pytest.param(True, False, id="tn"), + pytest.param(True, True, id="tt"), +] + + +@pytest.fixture(scope="module", autouse=True) +def _setup_tilelang_environment(): + tilelang.disable_cache() + tilelang.testing.set_random_seed(42) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + import torch + + required_torch_attrs = { + in_dtype, + out_dtype, + accum_dtype, + } + for attr in required_torch_attrs: + if not hasattr(torch, attr): + pytest.skip(f"Torch does not expose dtype {attr}") + run_gemm( + m, + n, + k * 3, + False, + True, + in_dtype, + out_dtype, + accum_dtype, + m, + n, + k, + 2, + 128, + ) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_false_false(m, n, k): + run_gemm( + m, + n, + k * 3, + False, + False, + T.float16, + T.float16, + T.float16, + m, + n, + k, + 2, + 128, + ) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + _ensure_torch_dtypes(in_dtype, out_dtype, accum_dtype) + run_gemm_rs_false_true(m, n, k, in_dtype, out_dtype, accum_dtype) + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k", K_VALUES, ids=lambda v: f"K{v}") +def test_gemm_rs_false_false(m, n, k): + _ensure_torch_dtypes(T.float16) + run_gemm_rs_false_false(m, n, k) + + +if __name__ == "__main__": + tilelang.testing.main() + + # # Test Pass + # for m in [64, 128]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64]: + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k * 3, False, True, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m} {n} {k} Pass") + + # # Test Pass + # for m in [64, 128]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64]: + # print(f"======================= Test {m} {n} {k} False False =============================") + # run_gemm(m, n, k * 3, False, False, T.float16, T.float16, T.float16, m, n, k, 2, 128) + # print(f"Test {m} {n} {k} Pass") diff --git a/maint/gemm_v2/correctness_evaluation_tcgen05.py b/maint/gemm_v2/correctness_evaluation_tcgen05.py new file mode 100644 index 000000000..8d9728182 --- /dev/null +++ b/maint/gemm_v2/correctness_evaluation_tcgen05.py @@ -0,0 +1,218 @@ +# pytest correctness_evaluation.py -n 32 +import pytest +from tilelang import tvm as tvm +import tilelang.testing +import tilelang.language as T + + +def matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + 
out_dtype, + accum_dtype, + num_stages, + threads, +): + A_shape = (K, M) if trans_A else (M, K) + B_shape = (N, K) if trans_B else (K, N) + A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K) + B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype) + B_shared = T.alloc_shared(B_shared_shape, in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + mbar = T.alloc_barrier(1) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): + T.copy(A[by * block_M, k * block_K], A_shared) + T.copy(B[bx * block_N, k * block_K], B_shared) + T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k == 0) + T.mbarrier_wait_parity(mbar, k % 2) + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return main + + +def _compile_and_check( + program, + trans_A, + trans_B, + in_dtype, + out_dtype, +): + kernel = tilelang.compile( + program, + out_idx=[2], + pass_configs={ + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + }, + ) + + print(kernel.get_kernel_source()) + + profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + + def ref_program(A, B): + import torch + + if trans_A: + A = A.T + if trans_B: + B = B.T + if in_dtype == T.float32: + A = (A.view(torch.int32) - 0x1000).view(torch.float32) + B = (B.view(torch.int32) - 0x1000).view(torch.float32) + C = torch.matmul(A.to(torch.float), B.to(torch.float)) + C = C.to(torch.__getattribute__(out_dtype)) + return C + + profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) + print("assert_allclose") + + +def run_gemm( + M, + N, + K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + block_M, + block_N, + block_K, + num_stages=2, + num_threads=128, +): + if block_N >= 256 or block_M >= 256 or block_K >= 256: + num_stages = 0 + program = matmul( + M, + N, + K, + block_M, + block_N, + block_K, + trans_A, + trans_B, + in_dtype, + out_dtype, + dtypeAccum, + num_stages, + num_threads, + ) + + _compile_and_check(program, trans_A, trans_B, in_dtype, out_dtype) + + +M_VALUES = [32, 64, 128, 256] +N_VALUES = [64, 128, 256, 512] +K_VALUES = [16, 32, 64, 128] +K_VALUES_8Bit = [32, 64, 128] +FALSE_TRUE_CASES = [ + pytest.param( + k, + T.float16, + T.float32, + T.float32, + id=f"K{k}-float16-float-float", + ) + for k in K_VALUES +] + [ + pytest.param( + k, + T.float8_e5m2, + T.float32, + T.float32, + id="K32-float8_e5m2-float32-float32", + ) + for k in K_VALUES_8Bit +] + +TRANS_CASES = [ + pytest.param(False, True, id="nt"), +] + + +@pytest.mark.parametrize("m", M_VALUES, ids=lambda v: f"M{v}") +@pytest.mark.parametrize("n", N_VALUES, ids=lambda v: f"N{v}") +@pytest.mark.parametrize("k,in_dtype,out_dtype,accum_dtype", FALSE_TRUE_CASES) +def test_gemm_false_true(m, n, k, in_dtype, out_dtype, accum_dtype): + import torch + + required_torch_attrs = { + in_dtype, + out_dtype, + accum_dtype, + } + for attr in required_torch_attrs: + if not hasattr(torch, attr): + pytest.skip(f"Torch does not expose dtype {attr}") + run_gemm( 
+ m, + n, + k * 3, + False, + True, + in_dtype, + out_dtype, + accum_dtype, + m, + n, + k, + ) + + +if __name__ == "__main__": + tilelang.testing.main() + + # # Test Pass + # for m in [32, 64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [16, 32, 64, 128]: + # if m in [32, 64] and (n not in [64, 128, 256]): + # continue + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k * 3, False, True, T.float16, T.float, T.float, m, n, k, 2, 128) + # print(f"Test {m} {n} {k} Pass") + + # # Test Pass + # for m in [32, 64, 128, 256]: + # for n in [32, 64, 128]: + # for k in [16, 32, 64, 128]: + # if m in [32, 64] and (n not in [64, 128, 256]): + # continue + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k * 3, False, True, T.float16, T.float, T.float, m, n, k, 2, 256) + # print(f"Test {m} {n} {k} Pass") + + # # Test Pass + # for m in [32, 64, 128, 256]: + # for n in [16, 32, 64, 128]: + # for k in [32, 64, 128]: + # if m in [32, 64] and (n not in [64, 128, 256]): + # continue + # print(f"======================= Test {m} {n} {k} False True =============================") + # run_gemm(m, n, k * 3, False, True, T.float8_e5m2, T.float, T.float, m, n, k, 2, 128) diff --git a/maint/gemm_v2/latency.py b/maint/gemm_v2/latency.py new file mode 100644 index 000000000..b7b2a2af9 --- /dev/null +++ b/maint/gemm_v2/latency.py @@ -0,0 +1,98 @@ +import tilelang +import tilelang.language as T +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--use_v2", action="store_true") +args = parser.parse_args() + +use_v2 = args.use_v2 + + +# @tilelang.jit(target="cuda") +# target currently can be "cuda" or "hip" or "cpu". +# if not specified, it will be inferred from the input tensors during compile time +@tilelang.jit +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): + @T.prim_func + def matmul_relu_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Enable rasterization for better L2 cache locality (Optional) + # T.use_swizzle(panel_size=10, enable=True) + + # Clear local accumulation + T.clear(C_local) + + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + # Copy tile of A + # This is a sugar syntax for parallelized copy + T.copy(A[by * block_M, ko * block_K], A_shared) + + # Copy tile of B + T.copy(B[ko * block_K, bx * block_N], B_shared) + + # Perform a tile-level GEMM on the shared buffers + # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs + if use_v2: + T.gemm_v2(A_shared, B_shared, C_local) + else: + T.gemm_v1(A_shared, B_shared, C_local) + + # relu + for i, j in T.Parallel(block_M, block_N): + C_local[i, j] = T.max(C_local[i, j], 0) + + # Copy result back to global memory + T.copy(C_local, C[by * block_M, bx * block_N]) + + return matmul_relu_kernel + + +M = 16384 # M = T.dynamic("m") if you want to use dynamic shape +N = 16384 +K = 16384 +block_M = 128 +block_N = 128 +block_K = 32 + +# 1. Define the kernel (matmul) and compile/lower it into an executable module +matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K) + +# 3. 
Test the kernel in Python with PyTorch data +import torch + +# Create random input tensors on the GPU +a = torch.randn(M, K, device="cuda", dtype=torch.float16) +b = torch.randn(K, N, device="cuda", dtype=torch.float16) +c = torch.empty(M, N, device="cuda", dtype=torch.float16) + +# Run the kernel through the Profiler +matmul_relu_kernel(a, b, c) + +print(c) +# Reference multiplication using PyTorch +ref_c = torch.relu(a @ b) + +# Validate correctness +torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) +print("Kernel output matches PyTorch reference.") + +# 4. Retrieve and inspect the generated CUDA source (optional) +# cuda_source = jit_kernel.get_kernel_source() +# print("Generated CUDA kernel:\n", cuda_source) + +# 5.Profile latency with kernel +profiler = matmul_relu_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + +latency = profiler.do_bench() + +print(f"Latency: {latency} ms") diff --git a/maint/gemm_v2/latency_gemm.py b/maint/gemm_v2/latency_gemm.py new file mode 100644 index 000000000..5f0450e02 --- /dev/null +++ b/maint/gemm_v2/latency_gemm.py @@ -0,0 +1,98 @@ +import tilelang +import tilelang.language as T +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--use_v2", action="store_true") +args = parser.parse_args() + +use_v2 = args.use_v2 + + +# @tilelang.jit(target="cuda") +# target currently can be "cuda" or "hip" or "cpu". +# if not specified, it will be inferred from the input tensors during compile time +@tilelang.jit +def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): + @T.prim_func + def matmul_relu_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + # Initialize Kernel Context + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Enable rasterization for better L2 cache locality (Optional) + # T.use_swizzle(panel_size=10, enable=True) + + # Clear local accumulation + T.clear(C_local) + + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + # Copy tile of A + # This is a sugar syntax for parallelized copy + T.copy(A[by * block_M, ko * block_K], A_shared) + + # Copy tile of B + T.copy(B[ko * block_K, bx * block_N], B_shared) + + # Perform a tile-level GEMM on the shared buffers + # Currently we dispatch to the cute/hip on Nvidia/AMD GPUs + if use_v2: + T.gemm_v2(A_shared, B_shared, C_local) + else: + T.gemm_v1(A_shared, B_shared, C_local) + + # relu + for i, j in T.Parallel(block_M, block_N): + C_local[i, j] = T.max(C_local[i, j], 0) + + # Copy result back to global memory + T.copy(C_local, C[by * block_M, bx * block_N]) + + return matmul_relu_kernel + + +M = 16384 # M = T.dynamic("m") if you want to use dynamic shape +N = 16384 +K = 16384 +block_M = 128 +block_N = 128 +block_K = 64 + +# 1. Define the kernel (matmul) and compile/lower it into an executable module +matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K) + +# 3. 
Test the kernel in Python with PyTorch data +import torch + +# Create random input tensors on the GPU +a = torch.randn(M, K, device="cuda", dtype=torch.float16) +b = torch.randn(K, N, device="cuda", dtype=torch.float16) +c = torch.empty(M, N, device="cuda", dtype=torch.float16) + +# Run the kernel through the Profiler +matmul_relu_kernel(a, b, c) + +print(c) +# Reference multiplication using PyTorch +ref_c = torch.relu(a @ b) + +# Validate correctness +torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) +print("Kernel output matches PyTorch reference.") + +# 4. Retrieve and inspect the generated CUDA source (optional) +# cuda_source = jit_kernel.get_kernel_source() +# print("Generated CUDA kernel:\n", cuda_source) + +# 5.Profile latency with kernel +profiler = matmul_relu_kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal) + +latency = profiler.do_bench() + +print(f"Latency: {latency} ms") diff --git a/maint/gemm_v2/latency_mha_fwd_bhsd.py b/maint/gemm_v2/latency_mha_fwd_bhsd.py new file mode 100644 index 000000000..7a83d7cec --- /dev/null +++ b/maint/gemm_v2/latency_mha_fwd_bhsd.py @@ -0,0 +1,228 @@ +import torch +import torch.nn.functional as F +import tilelang +from tilelang.autotuner import * +import tilelang.language as T +import itertools +import argparse +from functools import partial + +parser = argparse.ArgumentParser() +parser.add_argument("--batch", type=int, default=128, help="batch size") +parser.add_argument("--heads", type=int, default=16, help="heads") +parser.add_argument("--seq_q", type=int, default=1024, help="query sequence length") +parser.add_argument("--seq_kv", type=int, default=1024, help="key/value sequence length") +parser.add_argument("--dim", type=int, default=256, help="dim") +parser.add_argument("--is_causal", action="store_true", help="causal") +parser.add_argument("--tune", action="store_true", help="tune configs") +parser.add_argument("--use_v2", action="store_true") + +args = parser.parse_args() + +use_v2 = args.use_v2 + + +def get_configs(): + iter_params = dict(block_M=[128], block_N=[128], num_stages=[2], threads=[256]) + return [dict(zip(iter_params, values)) for values in itertools.product(*iter_params.values())] + + +@autotune(configs=get_configs(), warmup=10, rep=10) +@tilelang.jit( + out_idx=[3], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=0, threads=128): + scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) + q_shape = [batch, heads, seq_q, dim] + kv_shape = [batch, heads, seq_kv, dim] + dtype = T.float16 + accum_dtype = T.float32 + + past_len = seq_kv - seq_q + assert past_len >= 0, "seq_kv must be greater than or equal to seq_q" + + @T.macro + def MMA0( + K: T.Tensor(kv_shape, dtype), + Q_shared: T.SharedBuffer([block_M, dim], dtype), + K_shared: T.SharedBuffer([block_N, dim], dtype), + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + k: T.int32, + bx: T.int32, + by: T.int32, + bz: T.int32, + ): + T.copy(K[bz, by, k * block_N : (k + 1) * block_N, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + past_len + k_idx = k * block_N + j + acc_s[i, j] = T.if_then_else(q_idx >= k_idx, 0, -T.infinity(acc_s.dtype)) + else: + T.clear(acc_s) + if use_v2: + T.gemm_v2(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + else: + T.gemm_v1(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + 
@T.macro + def MMA1( + V: T.Tensor(kv_shape, dtype), + V_shared: T.SharedBuffer([block_N, dim], dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + k: T.int32, + by: T.int32, + bz: T.int32, + ): + T.copy(V[bz, by, k * block_N : (k + 1) * block_N, :], V_shared) + # T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if use_v2: + T.gemm_v2(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + else: + T.gemm_v1(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + @T.macro + def Softmax( + acc_s: T.FragmentBuffer([block_M, block_N], accum_dtype), + acc_s_cast: T.FragmentBuffer([block_M, block_N], dtype), + scores_max: T.FragmentBuffer([block_M], accum_dtype), + scores_max_prev: T.FragmentBuffer([block_M], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + scores_sum: T.FragmentBuffer([block_M], accum_dtype), + logsum: T.FragmentBuffer([block_M], accum_dtype), + ): + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + # To do causal softmax, we need to set the scores_max to 0 if it is -inf + # This process is called Check_inf in FlashAttention3 code, and it only need to be done + # in the first ceil_div(kBlockM, kBlockN) steps. + # for i in T.Parallel(block_M): + # scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + + for i, j in T.Parallel(block_M, block_N): + # Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + # max * log_2(e)) This allows the compiler to use the ffma + # instruction instead of fadd and fmul separately. 
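+                # Here scale = log2(e) / sqrt(dim), so the expression below equals
+                # exp((x - max) / sqrt(dim)) rewritten in base 2.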
+ acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + @T.macro + def Rescale( + acc_o: T.FragmentBuffer([block_M, dim], accum_dtype), + scores_scale: T.FragmentBuffer([block_M], accum_dtype), + ): + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_q, block_M), heads, batch, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + T.copy(Q[bz, by, bx * block_M : (bx + 1) * block_M, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + loop_range = ( + T.min(T.ceildiv(seq_kv, block_N), T.ceildiv((bx + 1) * block_M + past_len, block_N)) + if is_causal + else T.ceildiv(seq_kv, block_N) + ) + + for k in T.Pipelined(loop_range, num_stages=num_stages): + MMA0(K, Q_shared, K_shared, acc_s, k, bx, by, bz) + Softmax(acc_s, acc_s_cast, scores_max, scores_max_prev, scores_scale, scores_sum, logsum) + Rescale(acc_o, scores_scale) + MMA1(V, V_shared, acc_s_cast, acc_o, k, by, bz) + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] /= logsum[i] + T.copy(acc_o, O_shared) + T.copy(O_shared, Output[bz, by, bx * block_M : (bx + 1) * block_M, :]) + + return main + + +def ref_program(Q, K, V, is_causal): + dim = Q.size(-1) + scores = torch.einsum("bhqd,bhkd->bhqk", Q, K) + scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) + if is_causal: + seq_q = Q.size(2) + seq_kv = K.size(2) + mask = torch.tril(torch.ones(seq_q, seq_kv, device=scores.device), seq_kv - seq_q) + mask = mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(mask == 0, float("-inf")) + attention_weights = F.softmax(scores, dim=-1) + output = torch.einsum("bhqk,bhkd->bhqd", attention_weights, V) + return output + + +def main( + batch: int = 1, + heads: int = 1, + seq_q: int = 256, + seq_kv: int = 256, + dim: int = 64, + is_causal: bool = False, + tune: bool = False, +): + flops_per_matmul = 2.0 * batch * heads * seq_q * seq_kv * dim + total_flops = 2 * flops_per_matmul + if is_causal: + total_flops *= 0.5 + + if not tune: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal, block_M=64, block_N=64, num_stages=0, threads=128) + print(kernel.get_kernel_source()) + ref_program_processed = partial(ref_program, is_causal=is_causal) + + profiler = kernel.get_profiler() + profiler.assert_allclose(ref_program_processed, rtol=0.01, atol=0.01) + print("All checks pass.") + latency = profiler.do_bench(ref_program_processed, warmup=500) + print(f"Ref: {latency:.2f} ms") + print(f"Ref: {total_flops / latency * 1e-9:.2f} TFlops") + latency = 
profiler.do_bench(warmup=500) + print(f"Tile-lang: {latency:.2f} ms") + print(f"Tile-lang: {total_flops / latency * 1e-9:.2f} TFlops") + else: + kernel = flashattn(batch, heads, seq_q, seq_kv, dim, is_causal) + best_latency = kernel.latency + best_config = kernel.config + ref_latency = kernel.ref_latency + print(f"Best latency: {best_latency}") + print(f"Best TFlops: {total_flops / best_latency * 1e-9}") + print(f"Best config: {best_config}") + print(f"Ref latency: {ref_latency}") + + +if __name__ == "__main__": + tilelang.disable_cache() + main(args.batch, args.heads, args.seq_q, args.seq_kv, args.dim, args.is_causal, args.tune) diff --git a/maint/host_checks/01_num_args_mismatch.py b/maint/host_checks/01_num_args_mismatch.py new file mode 100644 index 000000000..9528652ee --- /dev/null +++ b/maint/host_checks/01_num_args_mismatch.py @@ -0,0 +1,22 @@ +"""Reproduce: Argument count mismatch. + +Note: The adapter-level wrapper expects only inputs (A, B) because C is marked as output. +Calling with the wrong number of inputs raises a ValueError before host entry. +""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 256 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda", dtype=torch.float16) + # Missing b + # Expected: ValueError with message about expected vs. actual inputs + fn(a) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/02_pointer_type_error.py b/maint/host_checks/02_pointer_type_error.py new file mode 100644 index 000000000..188a4f8cc --- /dev/null +++ b/maint/host_checks/02_pointer_type_error.py @@ -0,0 +1,23 @@ +"""Reproduce: Pointer-type argument expected but scalar provided. + +We pass an integer for A; wrapper forwards it to the host where a pointer is expected. +Expected: error like "Expect buffer A_handle to be pointer or tensor" (exact name depends on kernel param). 
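+
+For contrast, the same kernel accepts the call when both operands are CUDA float16
+tensors of the declared shapes (the pattern used by the other scripts in this folder):
+
+    a = torch.empty((M, K), device="cuda", dtype=torch.float16)
+    b = torch.empty((K, N), device="cuda", dtype=torch.float16)
+    fn(a, b)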
+""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 256 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # Wrong type for A (int instead of tensor) + a = 1 + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/03_ndim_mismatch.py b/maint/host_checks/03_ndim_mismatch.py new file mode 100644 index 000000000..76637e8de --- /dev/null +++ b/maint/host_checks/03_ndim_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: ndim (rank) mismatch for A.""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # A has rank 3 instead of 2 + a = torch.empty((M, K, 1), device="cuda", dtype=torch.float16) + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/04_dtype_mismatch.py b/maint/host_checks/04_dtype_mismatch.py new file mode 100644 index 000000000..f3554c1d6 --- /dev/null +++ b/maint/host_checks/04_dtype_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: dtype mismatch for A (float32 vs expected float16).""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + print(fn.get_host_source()) + + a = torch.empty((M, K), device="cuda", dtype=torch.float32) # should be float16 + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/05_shape_mismatch.py b/maint/host_checks/05_shape_mismatch.py new file mode 100644 index 000000000..a48248176 --- /dev/null +++ b/maint/host_checks/05_shape_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: shape constant/symbol mismatch on A.""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + # A's second dimension is wrong (K+1 instead of K) + a = torch.empty((M, K + 1), device="cuda", dtype=torch.float16) + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/06_strides_mismatch.py b/maint/host_checks/06_strides_mismatch.py new file mode 100644 index 000000000..7e523cd64 --- /dev/null +++ b/maint/host_checks/06_strides_mismatch.py @@ -0,0 +1,19 @@ +"""Reproduce: strides check failure (non-contiguous A via transpose).""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 128 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda", dtype=torch.float16) + a_nc = a.t() # non-contiguous after transpose + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a_nc, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/07_device_type_mismatch.py b/maint/host_checks/07_device_type_mismatch.py new file mode 100644 index 000000000..af8e5efd5 --- /dev/null +++ b/maint/host_checks/07_device_type_mismatch.py @@ -0,0 +1,18 @@ +"""Reproduce: device_type mismatch by passing CPU tensors to a CUDA kernel.""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cpu", dtype=torch.float16) + b = torch.empty((K, N), device="cpu", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff 
--git a/maint/host_checks/08_device_id_mismatch.py b/maint/host_checks/08_device_id_mismatch.py new file mode 100644 index 000000000..280aca157 --- /dev/null +++ b/maint/host_checks/08_device_id_mismatch.py @@ -0,0 +1,25 @@ +"""Reproduce: device_id mismatch (requires >=2 CUDA devices).""" + +import torch +from common import build_matmul_kernel + + +def main(): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if torch.cuda.device_count() < 2: + print("[SKIP] Need at least 2 CUDA devices to reproduce device_id mismatch.") + return + + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = torch.empty((M, K), device="cuda:0", dtype=torch.float16) + b = torch.empty((K, N), device="cuda:1", dtype=torch.float16) + # Output device is derived by the adapter; mismatch occurs in host checks + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/09_null_data_pointer.py b/maint/host_checks/09_null_data_pointer.py new file mode 100644 index 000000000..09f5de1af --- /dev/null +++ b/maint/host_checks/09_null_data_pointer.py @@ -0,0 +1,26 @@ +"""Reproduce: NULL data pointer (advanced). + +Passing None for a tensor argument will be forwarded through the adapter. Depending on +FFI handling, this commonly triggers a pointer-type assertion (e.g., "Expect buffer to be pointer or tensor") +or a host-side non-NULL pointer check. + +Note: Constructing a true DLTensor with NULL data in PyTorch is not typical; this script +demonstrates passing None, which still reproduces the intended class of failure. +""" + +import torch +from common import build_matmul_kernel + + +def main(): + M = N = K = 64 + fn = build_matmul_kernel(M, N, K, target="cuda") + + a = None # attempt to pass a null-like pointer + b = torch.empty((K, N), device="cuda", dtype=torch.float16) + + fn(a, b) + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/10_scalar_type_mismatch.py b/maint/host_checks/10_scalar_type_mismatch.py new file mode 100644 index 000000000..4f2c90b8d --- /dev/null +++ b/maint/host_checks/10_scalar_type_mismatch.py @@ -0,0 +1,15 @@ +"""Reproduce: scalar parameter type mismatch (int/bool).""" + +from common import build_scalar_check_kernel + + +def main(): + fn = build_scalar_check_kernel(target="cuda") + + # Wrong types + fn(1.0, True) # x should be int -> Expect arg[0] to be int + fn(1, 2.5) # flag should be bool -> Expect arg[1] to be boolean + + +if __name__ == "__main__": + main() diff --git a/maint/host_checks/README.md b/maint/host_checks/README.md new file mode 100644 index 000000000..ac23d6fd2 --- /dev/null +++ b/maint/host_checks/README.md @@ -0,0 +1,21 @@ +# Host-Side Check Repro Scripts + +This folder contains standalone scripts that deliberately trigger host-side (and adapter-side) validation errors described in `docs/compiler_internals/tensor_checks.md`. Each script can be run directly and will reproduce the corresponding error with a minimal example. + +Prerequisites +- CUDA-capable environment (most scripts compile a CUDA-targeted kernel) +- Python packages: torch, tilelang + +Usage +- Run any script, e.g.: + - `python 01_num_args_mismatch.py` + - `python 02_pointer_type_error.py` + - ... up to `10_scalar_type_mismatch.py` + +- Or run all at once with a summary: + - `python run_all.py` + - Logs per test are saved under `logs/` as `