177 changes: 160 additions & 17 deletions .github/workflows/pr-regression-test-bot.yml
@@ -23,8 +23,12 @@ env:
COLUMNS: "100"
FORCE_COLOR: "1"
CLICOLOR_FORCE: "1"
UV_INDEX_STRATEGY: "unsafe-best-match"
UV_HTTP_TIMEOUT: "600"
XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated
PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated
UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated
PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pip/.pre-commit" # to be updated

jobs:
pr-regression:
@@ -33,42 +37,181 @@ jobs:
github.repository_owner == 'tile-ai' &&
github.event.issue.pull_request &&
(contains(github.event.comment.body, '@regression-perf'))
runs-on: [self-hosted, nvidia]
runs-on: ${{ matrix.runner.tags }}
strategy:
matrix:
runner:
- tags: [self-hosted, nvidia]
name: self-hosted-nvidia
toolkit: CUDA-12.8
python-version:
- "3.12"
fail-fast: false
timeout-minutes: 120

steps:
- name: Get commenter permission
id: perm
uses: actions/github-script@v7
with:
script: |
const username = context.payload.comment.user.login
const { owner, repo } = context.repo
const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username })
core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none

- name: Reject if not allowed
if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }}
run: |
echo "Not authorized: permission=${{ steps.perm.outputs.permission }}"
exit 1

- name: Checkout repository
uses: actions/checkout@v6
with:
ref: refs/pull/${{ github.event.issue.number }}/merge
fetch-depth: 0
submodules: recursive

- name: Setup Python
uses: actions/setup-python@v6
- name: Set environment (self-hosted runners)
if: startsWith(matrix.runner.name, 'self-hosted')
run: |
# Hide sensitive data in logs for self-hosted runners
if [[ -n "${{ secrets.SECRET_PATH_PREFIXES }}" ]]; then
echo "::add-mask::${{ secrets.SECRET_PATH_PREFIXES }}"
# Colon separated list of secrets to mask
for secret in $(echo "${{ secrets.SECRET_PATH_PREFIXES }}" | tr ':' '\n'); do
echo "::add-mask::${secret}"
done
fi

# Use runner tool_cache as cache root for self-hosted runners to avoid internet connection
# issues and to share cache between jobs.
export XDG_CACHE_HOME="${{ runner.tool_cache }}/.ci-cache-${{ github.workflow }}"
echo "XDG_CACHE_HOME=${XDG_CACHE_HOME}" | tee -a "${GITHUB_ENV}"
echo "PIP_CACHE_DIR=${XDG_CACHE_HOME}/pip" | tee -a "${GITHUB_ENV}"
echo "UV_CACHE_DIR=${XDG_CACHE_HOME}/uv" | tee -a "${GITHUB_ENV}"
echo "PRE_COMMIT_HOME=${XDG_CACHE_HOME}/pip/.pre-commit" | tee -a "${GITHUB_ENV}"

# Do not use ccache on self-hosted runners, as it will download/upload caches which is slow.
# Self-hosted runners usually have more CPU power to compile without ccache.
- name: Setup ccache (GitHub-hosted runners)
id: setup-ccache
if: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
uses: hendrikmuhs/ccache-action@v1
with:
create-symlink: true
evict-old-files: "7d"
append-timestamp: false
key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }}
restore-keys: |
${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }}
${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}
${{ runner.os }}-${{ runner.arch }}

- name: Set environment (CUDA)
if: contains(matrix.runner.toolkit, 'CUDA')
run: |
TOOLKIT="${{ matrix.runner.toolkit }}"
CUDA_VERSION="${TOOLKIT##*-}"
CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)"
CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}"
if [[ "${TOOLKIT}" == "Nightly-"* ]]; then
# Use torch nightly builds
export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}"
else
export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}"
fi
export UV_INDEX="${PIP_EXTRA_INDEX_URL}"
export CLANG_TIDY_CMAKE_OPTIONS="${CLANG_TIDY_CMAKE_OPTIONS} -DUSE_CUDA=ON"

echo "USE_CUDA=ON" | tee -a "${GITHUB_ENV}"
echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}"
echo "CUDA_VERSION_MAJMIN=${CUDA_VERSION_MAJMIN}" | tee -a "${GITHUB_ENV}"
echo "CUDA_VERSION_MAJMIN_NODOT=${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}"
echo "PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}" | tee -a "${GITHUB_ENV}"
echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}"
echo "CLANG_TIDY_CMAKE_OPTIONS=${CLANG_TIDY_CMAKE_OPTIONS}" | tee -a "${GITHUB_ENV}"

if [[ ! -x "$(command -v nvcc)" ]]; then
export PATH="/usr/local/cuda/bin:${PATH}"
export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
echo "PATH=${PATH}" | tee -a "${GITHUB_ENV}"
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" | tee -a "${GITHUB_ENV}"
fi
if [[ -x "$(command -v nvcc)" ]]; then
echo "\$ $(command -v nvcc) --version" && nvcc --version
else
echo "::warning::nvcc not found in PATH!"
fi

- name: Setup Python and uv with caching
id: setup-uv
uses: astral-sh/setup-uv@v7
with:
python-version: "3.12"
update-environment: true
cache: pip
cache-dependency-path: |
python-version: ${{ matrix.python-version }}
activate-environment: true
# Do not use cache for self-hosted runners, as it will download/upload caches which is slow.
enable-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
prune-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
# Use runner tool_cache for self-hosted runners
cache-local-path: ${{ env.UV_CACHE_DIR }}
ignore-nothing-to-cache: true
# Extra cache key to upload/download caches on GitHub-hosted runners
cache-suffix: uv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.runner.name }}-${{ matrix.runner.toolkit }}
cache-dependency-glob: |
pyproject.toml
requirements*.txt

- name: Install PR version (new)
- name: Setup environments
id: setup-venv
run: |
python -m venv new
set -e

uv venv --python "${{ matrix.python-version }}" new

source new/bin/activate
pip install --no-user -r requirements-test.txt
pip install --no-user .
uv pip install -v -r requirements-test.txt
uv pip install -v .

- name: Install main version (old)
- name: Install Main version (Baseline)
run: |
echo "Check files to be deleted!"
set -e
git clean -dxf -e new/ -e .cache/
echo "Delete files completed!"
git checkout main
python -m venv old
git submodule update --init --recursive
uv venv --python "${{ matrix.python-version }}" old
source old/bin/activate
pip install --no-user -r requirements-test.txt
pip install --no-user .

uv pip install -v -r requirements-test.txt
uv pip install -v .

- name: Clear uv cache for self-hosted runners (if setup failed)
if: >-
${{
failure() &&
startsWith(matrix.runner.name, 'self-hosted') &&
(steps.setup-uv.conclusion == 'failure' || steps.setup-venv.conclusion == 'failure')
}}
run: |
echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure."
uv cache clean

- name: Enable core dump generation (Linux / GitHub-hosted runners)
if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }}
run: |
sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P"
sudo sysctl -w kernel.core_uses_pid=0
sudo sysctl -w fs.suid_dumpable=1
sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable

- name: Enable core dump generation (macOS / GitHub-hosted runners)
if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }}
run: |
sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P"
sudo sysctl -w kern.coredump=1
sudo sysctl -w kern.sugid_coredump=1
sysctl kern.corefile kern.coredump kern.sugid_coredump

- name: Run performance regression test
run: |
2 changes: 1 addition & 1 deletion examples/dynamic_shape/example_dynamic.py
@@ -4,7 +4,7 @@
from tilelang import tvm as tvm


@tilelang.jit(pass_configs={"tl.disable_dynamic_tail_split": True, "tl.dynamic_alignment": 8})
@tilelang.jit
def matmul_dynamic_mnk(
block_M,
block_N,
43 changes: 14 additions & 29 deletions examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
@@ -339,38 +339,23 @@ def run1():
print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9))


def run_regression_perf():
BATCH = 1
H = 32
N_CTX = 256
D_HEAD_QK = 192
D_HEAD_V = 128
groups = 16
causal = False
device = "cuda"
torch.manual_seed(42)
def run_regression_perf(
BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False
):
Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()

head_kv = H // groups
Q = torch.randn(BATCH, N_CTX, H, D_HEAD_QK, device=device, dtype=torch.half)
K = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.half)
V = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.half)
O = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half)
dO = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half)
lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32)
with torch.no_grad():
mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V)
kernel = flashattn_bwd(
BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, causal, block_M=128, block_N=32, threads=256, num_stages=2, groups=groups
)
dQ = torch.zeros_like(Q, dtype=torch.float32)
dK = torch.zeros(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.float32)
dV = torch.zeros(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.float32)
Delta = mod_prep(O, dO)
from tilelang.profiler import do_bench
K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
O = attention(Q, K, V, causal, groups)

def run1():
O.backward(dO, retain_graph=True)

def run_kernel_only():
kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV)
from tilelang.profiler import do_bench

return do_bench(run_kernel_only, warmup=10, rep=100, backend="cupti")
return do_bench(run1, warmup=500, backend="cupti")
Comment on lines +351 to +358

⚠️ Potential issue | 🟡 Minor


Clear gradients between backward iterations for consistent timing.

The run1() closure calls O.backward(dO, retain_graph=True) repeatedly without clearing gradients. Since do_bench() does not zero gradients between iterations, Q.grad, K.grad, and V.grad will accumulate values across warmup and benchmark iterations. This causes timing variations between the first iteration (gradient buffer allocation) and subsequent iterations, contaminating the measurement.

Add Q.grad = None; K.grad = None; V.grad = None before the backward call in run1(), or use torch.no_grad() if gradient tracking isn't needed for the benchmark.

The backend="cupti" choice is intentional and consistent with other backward examples in the codebase.

🧰 Tools
🪛 Ruff (0.14.10)

351-351: Ambiguous variable name: O (E741)

🤖 Prompt for AI Agents
In examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py around lines 351
to 358 the benchmark closure repeatedly calls O.backward(dO, retain_graph=True)
without clearing gradients, causing Q.grad/K.grad/V.grad to accumulate and skew
timing; before the backward call, clear gradients by setting Q.grad = None;
K.grad = None; V.grad = None (or wrap the benchmark in torch.no_grad() if
appropriate) so each iteration starts with fresh gradient buffers and produces
consistent timing measurements.
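
A minimal sketch of the suggested fix, assuming the attention helper defined earlier in this example and the do_bench call used in the diff above; the function name run_regression_perf_sketch and the default shapes are illustrative only.

import torch
from tilelang.profiler import do_bench


def run_regression_perf_sketch(BATCH=1, H=32, N_CTX=256, D_HEAD_QK=192, D_HEAD_V=128, groups=16, causal=False):
    head_kv = H // groups
    Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
    K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
    V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
    dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_()
    O = attention(Q, K, V, causal, groups)  # `attention` as defined earlier in this example file

    def run1():
        # Drop gradients left over from the previous iteration so every timed
        # backward pass starts with empty gradient buffers.
        Q.grad = K.grad = V.grad = None
        O.backward(dO, retain_graph=True)

    return do_bench(run1, warmup=500, backend="cupti")

Setting .grad to None rather than zeroing in place lets PyTorch release the old buffers, so every iteration pays the same allocation cost instead of only the first one.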



if __name__ == "__main__":
1 change: 0 additions & 1 deletion examples/flash_attention/example_gqa_fwd_bshd.py
@@ -247,7 +247,6 @@ def run_regression_perf(
batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False
):
kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128)

profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
return profiler.do_bench(backend="cupti")

examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
@@ -237,17 +237,11 @@ def run_regression_perf(
dim: int = 128,
is_causal: bool = False,
groups: int = 16,
tune: bool = False,
):
flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim
total_flops = 2 * flops_per_matmul
if is_causal:
total_flops *= 0.5

kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256)

profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
return profiler.do_bench(warmup=500, backend="cupti")
return profiler.do_bench(backend="cupti")

⚠️ Potential issue | 🟠 Major


Add warmup parameter to match main() profiling for consistent regression testing.

The run_regression_perf() invocation at line 244 omits the warmup parameter, which defaults to 25ms, while the main() function uses warmup=500 (lines 216, 219). This inconsistency degrades measurement reliability; flash attention kernels require adequate warmup to reach steady state and exclude JIT compilation overhead. Using insufficient warmup in regression testing produces unstable performance numbers, making reliable performance comparisons impossible.

Similar functions in the codebase (e.g., example_mha_fwd_bshd_wgmma_pipelined.py) correctly specify warmup=500 in their regression tests.

-    return profiler.do_bench(backend="cupti")
+    return profiler.do_bench(warmup=500, backend="cupti")
🤖 Prompt for AI Agents
In examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py around line
244, the run_regression_perf() call omits the warmup parameter causing
inconsistent and unstable profiling; update the call to pass warmup=500 (match
main() lines ~216–219) so the regression benchmark uses the same warmup duration
as main(), ensuring sufficient warmup for flash-attention kernels and consistent
measurements.



if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/flash_attention/example_mha_fwd_varlen.py
@@ -335,7 +335,7 @@ def run_regression_perf(batch: int = 8, heads: int = 64, seq_len: int = 2048, di
UQ = q_unpad.shape[0]
UK = k_unpad.shape[0]
UKV = k_unpad.shape[0]
kernel = flashattn(batch, UQ, UKV, heads, dim, causal)
kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=128, block_N=128, num_stages=2, threads=256)

from tilelang.profiler import do_bench

4 changes: 2 additions & 2 deletions examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
@@ -227,11 +227,11 @@ def main():
def run_regression_perf():
M, N, K = 128, 128, 128
out_dtype, accum_dtype = "float32", "float32"

⚠️ Potential issue | 🔴 Critical

Critical: Inconsistent dtype specification causes assertion failure.

Line 229 uses string literals "float32" for out_dtype and accum_dtype, but tl_matmul expects type constants. When these strings are passed to tl_matmul at lines 231 and 235, they fail the assertion at lines 47-51, which checks that out_dtype is in [T.float16, T.float32, T.int32]; the string "float32" is not equal to the type constant T.float32.

🔎 Proposed fix
-    out_dtype, accum_dtype = "float32", "float32"
+    out_dtype, accum_dtype = T.float32, T.float32
🤖 Prompt for AI Agents
In examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py around line 229 (and
the subsequent tl_matmul calls at ~231 and ~235), out_dtype and accum_dtype are
set to the string "float32" but tl_matmul expects type constants (e.g.,
T.float32). Replace the string literals with the proper type constants (for
example: out_dtype = T.float32, accum_dtype = T.float32) so the values passed
into tl_matmul match the assertion checks; ensure any other places that used
those variables expect the same T.* constants.

in_dtype = "float8_e4m3"
in_dtype = T.float8_e4m3fn
kernel_e4m3 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype)
profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer)
latency_e4m3 = profiler_e4m3.do_bench(warmup=25, backend="cupti")
in_dtype = "float8_e5m2"
in_dtype = T.float8_e5m2
kernel_e5m2 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype)
profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer)
latency_e5m2 = profiler_e5m2.do_bench(warmup=25, backend="cupti")
10 changes: 0 additions & 10 deletions examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py

This file was deleted.
