diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index 59ce74ad4..c7be92298 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -23,8 +23,12 @@ env:
   COLUMNS: "100"
   FORCE_COLOR: "1"
   CLICOLOR_FORCE: "1"
+  UV_INDEX_STRATEGY: "unsafe-best-match"
+  UV_HTTP_TIMEOUT: "600"
   XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated
   PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated
+  UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated
+  PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pip/.pre-commit" # to be updated

 jobs:
   pr-regression:
@@ -33,8 +37,35 @@ jobs:
       github.repository_owner == 'tile-ai' &&
       github.event.issue.pull_request &&
       (contains(github.event.comment.body, '@regression-perf'))
-    runs-on: [self-hosted, nvidia]
+    runs-on: ${{ matrix.runner.tags }}
+    strategy:
+      matrix:
+        runner:
+          - tags: [self-hosted, nvidia]
+            name: self-hosted-nvidia
+            toolkit: CUDA-12.8
+        python-version:
+          - "3.12"
+      fail-fast: false
+    timeout-minutes: 120
     steps:
+      - name: Get commenter permission
+        id: perm
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const username = context.payload.comment.user.login
+            const { owner, repo } = context.repo
+            const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username })
+            core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none
+
+      - name: Reject if not allowed
+        if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }}
+        run: |
+          echo "Not authorized: permission=${{ steps.perm.outputs.permission }}"
+          exit 1
+
       - name: Checkout repository
         uses: actions/checkout@v6
         with:
@@ -42,33 +73,145 @@ jobs:
         fetch-depth: 0
         submodules: recursive

-      - name: Setup Python
-        uses: actions/setup-python@v6
+      - name: Set environment (self-hosted runners)
+        if: startsWith(matrix.runner.name, 'self-hosted')
+        run: |
+          # Hide sensitive data in logs for self-hosted runners
+          if [[ -n "${{ secrets.SECRET_PATH_PREFIXES }}" ]]; then
+            echo "::add-mask::${{ secrets.SECRET_PATH_PREFIXES }}"
+            # Colon separated list of secrets to mask
+            for secret in $(echo "${{ secrets.SECRET_PATH_PREFIXES }}" | tr ':' '\n'); do
+              echo "::add-mask::${secret}"
+            done
+          fi
+
+          # Use runner tool_cache as cache root for self-hosted runners to avoid internet connection
+          # issues and to share cache between jobs.
+          export XDG_CACHE_HOME="${{ runner.tool_cache }}/.ci-cache-${{ github.workflow }}"
+          echo "XDG_CACHE_HOME=${XDG_CACHE_HOME}" | tee -a "${GITHUB_ENV}"
+          echo "PIP_CACHE_DIR=${XDG_CACHE_HOME}/pip" | tee -a "${GITHUB_ENV}"
+          echo "UV_CACHE_DIR=${XDG_CACHE_HOME}/uv" | tee -a "${GITHUB_ENV}"
+          echo "PRE_COMMIT_HOME=${XDG_CACHE_HOME}/pip/.pre-commit" | tee -a "${GITHUB_ENV}"
+
+      # Do not use ccache on self-hosted runners, as it will download/upload caches which is slow.
+      # Self-hosted runners usually have more CPU power to compile without ccache.
+      - name: Setup ccache (GitHub-hosted runners)
+        id: setup-ccache
+        if: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
+        uses: hendrikmuhs/ccache-action@v1
+        with:
+          create-symlink: true
+          evict-old-files: "7d"
+          append-timestamp: false
+          key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }}
+            ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}
+            ${{ runner.os }}-${{ runner.arch }}
+
+      - name: Set environment (CUDA)
+        if: contains(matrix.runner.toolkit, 'CUDA')
+        run: |
+          TOOLKIT="${{ matrix.runner.toolkit }}"
+          CUDA_VERSION="${TOOLKIT##*-}"
+          CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)"
+          CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}"
+          if [[ "${TOOLKIT}" == "Nightly-"* ]]; then
+            # Use torch nightly builds
+            export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}"
+          else
+            export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}"
+          fi
+          export UV_INDEX="${PIP_EXTRA_INDEX_URL}"
+          export CLANG_TIDY_CMAKE_OPTIONS="${CLANG_TIDY_CMAKE_OPTIONS} -DUSE_CUDA=ON"
+
+          echo "USE_CUDA=ON" | tee -a "${GITHUB_ENV}"
+          echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}"
+          echo "CUDA_VERSION_MAJMIN=${CUDA_VERSION_MAJMIN}" | tee -a "${GITHUB_ENV}"
+          echo "CUDA_VERSION_MAJMIN_NODOT=${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}"
+          echo "PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}" | tee -a "${GITHUB_ENV}"
+          echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}"
+          echo "CLANG_TIDY_CMAKE_OPTIONS=${CLANG_TIDY_CMAKE_OPTIONS}" | tee -a "${GITHUB_ENV}"
+
+          if [[ ! -x "$(command -v nvcc)" ]]; then
+            export PATH="/usr/local/cuda/bin:${PATH}"
+            export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+            echo "PATH=${PATH}" | tee -a "${GITHUB_ENV}"
+            echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" | tee -a "${GITHUB_ENV}"
+          fi
+          if [[ -x "$(command -v nvcc)" ]]; then
+            echo "\$ $(command -v nvcc) --version" && nvcc --version
+          else
+            echo "::warning::nvcc not found in PATH!"
+          fi
+
+      - name: Setup Python and uv with caching
+        id: setup-uv
+        uses: astral-sh/setup-uv@v7
         with:
-          python-version: "3.12"
-          update-environment: true
-          cache: pip
-          cache-dependency-path: |
+          python-version: ${{ matrix.python-version }}
+          activate-environment: true
+          # Do not use cache for self-hosted runners, as it will download/upload caches which is slow.
+          enable-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
+          prune-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
+          # Use runner tool_cache for self-hosted runners
+          cache-local-path: ${{ env.UV_CACHE_DIR }}
+          ignore-nothing-to-cache: true
+          # Extra cache key to upload/download caches on GitHub-hosted runners
+          cache-suffix: uv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.runner.name }}-${{ matrix.runner.toolkit }}
+          cache-dependency-glob: |
            pyproject.toml
            requirements*.txt

-      - name: Install PR version (new)
+      - name: Setup environments
+        id: setup-venv
         run: |
-          python -m venv new
+          set -e
+
+          uv venv --python "${{ matrix.python-version }}" new
+
           source new/bin/activate
-          pip install --no-user -r requirements-test.txt
-          pip install --no-user .
+          uv pip install -v -r requirements-test.txt
+          uv pip install -v .

-      - name: Install main version (old)
+      - name: Install Main version (Baseline)
         run: |
-          echo "Check files to be deleted!"
+          set -e
           git clean -dxf -e new/ -e .cache/
-          echo "Delete files completed!"
           git checkout main
-          python -m venv old
+          git submodule update --init --recursive
+          uv venv --python "${{ matrix.python-version }}" old
           source old/bin/activate
-          pip install --no-user -r requirements-test.txt
-          pip install --no-user .
+
+          uv pip install -v -r requirements-test.txt
+          uv pip install -v .
+
+      - name: Clear uv cache for self-hosted runners (if setup failed)
+        if: >-
+          ${{
+            failure() &&
+            startsWith(matrix.runner.name, 'self-hosted') &&
+            (steps.setup-uv.conclusion == 'failure' || steps.setup-venv.conclusion == 'failure')
+          }}
+        run: |
+          echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure."
+          uv cache clean
+
+      - name: Enable core dump generation (Linux / GitHub-hosted runners)
+        if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }}
+        run: |
+          sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P"
+          sudo sysctl -w kernel.core_uses_pid=0
+          sudo sysctl -w fs.suid_dumpable=1
+          sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable
+
+      - name: Enable core dump generation (macOS / GitHub-hosted runners)
+        if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }}
+        run: |
+          sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P"
+          sudo sysctl -w kern.coredump=1
+          sudo sysctl -w kern.sugid_coredump=1
+          sysctl kern.corefile kern.coredump kern.sugid_coredump

       - name: Run performance regression test
         run: |
diff --git a/examples/dynamic_shape/example_dynamic.py b/examples/dynamic_shape/example_dynamic.py
index 88a53e59d..a2d5b46ae 100644
--- a/examples/dynamic_shape/example_dynamic.py
+++ b/examples/dynamic_shape/example_dynamic.py
@@ -4,7 +4,7 @@
 from tilelang import tvm as tvm


-@tilelang.jit(pass_configs={"tl.disable_dynamic_tail_split": True, "tl.dynamic_alignment": 8})
+@tilelang.jit
 def matmul_dynamic_mnk(
     block_M,
     block_N,
diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
index 9e3089d23..463786bd5 100644
--- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
@@ -339,38 +339,23 @@ def run1():
     print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9))


-def run_regression_perf():
-    BATCH = 1
-    H = 32
-    N_CTX = 256
-    D_HEAD_QK = 192
-    D_HEAD_V = 128
-    groups = 16
-    causal = False
-    device = "cuda"
-    torch.manual_seed(42)
+def run_regression_perf(
+    BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False
+):
+    Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
+
     head_kv = H // groups
-    Q = torch.randn(BATCH, N_CTX, H, D_HEAD_QK, device=device, dtype=torch.half)
-    K = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.half)
-    V = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.half)
-    O = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half)
-    dO = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half)
-    lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32)
-    with torch.no_grad():
-        mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V)
-        kernel = flashattn_bwd(
-            BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, causal, block_M=128, block_N=32, threads=256, num_stages=2, groups=groups
-        )
-    dQ = torch.zeros_like(Q, dtype=torch.float32)
-    dK = torch.zeros(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.float32)
-    dV = torch.zeros(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.float32)
-    Delta = mod_prep(O, dO)
-    from tilelang.profiler import do_bench
+    K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
+    V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
+    dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
+    O = attention(Q, K, V, causal, groups)
+
+    def run1():
+        O.backward(dO, retain_graph=True)

-    def run_kernel_only():
-        kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV)
+    from tilelang.profiler import do_bench

-    return do_bench(run_kernel_only, warmup=10, rep=100, backend="cupti")
+    return do_bench(run1, warmup=500, backend="cupti")


 if __name__ == "__main__":
diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py
index cb3acc808..dea941642 100644
--- a/examples/flash_attention/example_gqa_fwd_bshd.py
+++ b/examples/flash_attention/example_gqa_fwd_bshd.py
@@ -247,7 +247,6 @@ def run_regression_perf(
     batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False
 ):
     kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128)
-    profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
     return profiler.do_bench(backend="cupti")


diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
index c59d693fd..5c0386410 100644
--- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
@@ -237,17 +237,11 @@ def run_regression_perf(
     dim: int = 128,
     is_causal: bool = False,
     groups: int = 16,
-    tune: bool = False,
 ):

-    flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim
-    total_flops = 2 * flops_per_matmul
-    if is_causal:
-        total_flops *= 0.5
-
     kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256)
     profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
-    return profiler.do_bench(warmup=500, backend="cupti")
+    return profiler.do_bench(backend="cupti")


 if __name__ == "__main__":
diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py
index 67165de1a..1f0b9555d 100644
--- a/examples/flash_attention/example_mha_fwd_varlen.py
+++ b/examples/flash_attention/example_mha_fwd_varlen.py
@@ -335,7 +335,7 @@ def run_regression_perf(batch: int = 8, heads: int = 64, seq_len: int = 2048, di
     UQ = q_unpad.shape[0]
     UK = k_unpad.shape[0]
     UKV = k_unpad.shape[0]
-    kernel = flashattn(batch, UQ, UKV, heads, dim, causal)
+    kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=128, block_N=128, num_stages=2, threads=256)

     from tilelang.profiler import do_bench

diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
index 2e88bb2be..c7b8a72ab 100644
--- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
+++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
@@ -227,11 +227,11 @@ def main():
 def run_regression_perf():
     M, N, K = 128, 128, 128
     out_dtype, accum_dtype = "float32", "float32"
-    in_dtype = "float8_e4m3"
+    in_dtype = T.float8_e4m3fn
     kernel_e4m3 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype)
     profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer)
     latency_e4m3 = profiler_e4m3.do_bench(warmup=25, backend="cupti")
-    in_dtype = "float8_e5m2"
+    in_dtype = T.float8_e5m2
     kernel_e5m2 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype)
     profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer)
     latency_e5m2 = profiler_e5m2.do_bench(warmup=25, backend="cupti")
diff --git a/examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py b/examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py
deleted file mode 100644
index 1f05f164f..000000000
--- a/examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import tilelang.testing
-import example_tilelang_gemm_streamk
-
-
-def regression_example_tilelang_gemm_streamk():
-    tilelang.testing.process_func(example_tilelang_gemm_streamk.run_regression_perf)
-
-
-if __name__ == "__main__":
-    tilelang.testing.regression()
diff --git a/maint/scripts/test_perf_regression.py b/maint/scripts/test_perf_regression.py
index 11a50c73b..00502724b 100644
--- a/maint/scripts/test_perf_regression.py
+++ b/maint/scripts/test_perf_regression.py
@@ -5,6 +5,8 @@
 import json
 from tabulate import tabulate
 import pandas as pd
+import numpy as np
+import textwrap

 try:
     import tilelang
@@ -48,86 +50,132 @@ def run_cmd(cmd, env=None):
     return p.stdout


-def draw(df):
+def draw(df: pd.DataFrame) -> None:
     import matplotlib.pyplot as plt
-    import seaborn as sns

-    if len(df) == 0:
+    if df is None or len(df) == 0:
         return

-    num_items = len(df)
-    calculated_width = max(8, num_items * 0.6)
-    calculated_height = 10  # A reasonable fixed height
-
-    plt.figure(figsize=(calculated_width, calculated_height))
+    # ---- copy + sanitize ----
+    df = df.copy()
+    df["Speedup"] = pd.to_numeric(df["Speedup"], errors="coerce")
+    df = df.dropna(subset=["Speedup"])
+
+    # categorize
+    df["Performance"] = np.where(df["Speedup"] >= 1.0, "Improved", "Regressed")
+    df["DeltaPct"] = (df["Speedup"] - 1.0) * 100.0
+
+    # sort: worst regressions at top? (common for dashboards)
+    # If you prefer best-to-worst, change ascending=False
+    df = df.sort_values("Speedup", ascending=True).reset_index(drop=True)
+
+    # ---- style ----
+    plt.rcParams.update(
+        {
+            "figure.dpi": 120,
+            "savefig.dpi": 300,
+            "axes.titlesize": 16,
+            "axes.labelsize": 12,
+            "xtick.labelsize": 10,
+            "ytick.labelsize": 10,
+        }
+    )

-    font_scale = 1.1 if num_items > 20 else 0.9
-    sns.set_theme(style="whitegrid", font_scale=font_scale)
+    n = len(df)
+    # height: ~0.35 inch per row + margins, with a sensible cap/floor
+    fig_h = min(max(6.0, 0.35 * n + 2.2), 22.0)
+    fig_w = 14.0
+    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
+
+    # palette
+    colors = {"Improved": "#2ecc71", "Regressed": "#e74c3c"}
+    bar_colors = df["Performance"].map(colors).tolist()
+
+    # wrap long labels (optional)
+    def wrap_label(s: str, width: int = 42) -> str:
+        return "\n".join(textwrap.wrap(str(s), width=width)) if len(str(s)) > width else str(s)
+
+    ylabels = [wrap_label(x) for x in df["File"].tolist()]
+    y = np.arange(n)
+
+    # bars
+    ax.barh(y, df["Speedup"].values, color=bar_colors, edgecolor="black", linewidth=0.4, height=0.72)
+
+    # baseline at 1.0x
+    ax.axvline(1.0, linestyle="--", linewidth=1.4, alpha=0.85)
+
+    # grid
+    ax.xaxis.grid(True, linestyle="-", linewidth=0.6, alpha=0.25)
+    ax.set_axisbelow(True)
+
+    # y ticks
+    ax.set_yticks(y)
+    ax.set_yticklabels(ylabels)
+
+    # x limits with padding (ensure 1.0 included)
+    x_min = float(df["Speedup"].min())
+    x_max = float(df["Speedup"].max())
+    pad = max(0.02, (x_max - x_min) * 0.12)
+    left = min(1.0, x_min) - pad
+    right = max(1.0, x_max) + pad
+    ax.set_xlim(left, right)
+
+    # annotate each bar
+    for i, (sx, dp) in enumerate(zip(df["Speedup"].values, df["DeltaPct"].values)):
+        label = f"{sx:.3f}x ({dp:+.2f}%)"
+        # place to right for improved, left for regressed (near bar end)
+        if sx >= 1.0:
+            ax.text(sx + 0.003, i, label, va="center", ha="left", fontsize=9)
+        else:
+            ax.text(sx - 0.003, i, label, va="center", ha="right", fontsize=9)
+
+    # labels & title
+    ax.set_xlabel("Speedup Ratio (New / Old)")
+    ax.set_ylabel("Benchmark File")
+    ax.set_title("Performance Regression Analysis")
+
+    # legend
+    from matplotlib.patches import Patch
+
+    legend_handles = [
+        Patch(facecolor=colors["Improved"], edgecolor="black", label="Improved (>= 1.0x)"),
+        Patch(facecolor=colors["Regressed"], edgecolor="black", label="Regressed (< 1.0x)"),
+    ]
+    ax.legend(handles=legend_handles, loc="upper left", frameon=True)
+
+    # summary box
+    num_improved = int((df["Performance"] == "Improved").sum())
+    num_regressed = int((df["Performance"] == "Regressed").sum())
+    best = df.iloc[df["Speedup"].idxmax()]
+    worst = df.iloc[df["Speedup"].idxmin()]
+    summary = (
+        f"Items: {n}\n"
+        f"Improved: {num_improved}\n"
+        f"Regressed: {num_regressed}\n"
+        f"Best: {best['File']} {best['Speedup']:.3f}x\n"
+        f"Worst: {worst['File']} {worst['Speedup']:.3f}x"
+    )
+    ax.text(
+        0.99,
+        0.01,
+        summary,
+        transform=ax.transAxes,
+        ha="right",
+        va="bottom",
+        fontsize=9,
+        bbox=dict(boxstyle="round,pad=0.45", facecolor="white", edgecolor="0.3", alpha=0.9),
+    )

-    df["Type"] = df["Speedup"].apply(lambda x: "Speedup" if x >= 1.0 else "Slowdown")
-    palette = {"Speedup": "#4CAF50", "Slowdown": "#F44336"}  # Green for good, Red for bad
+    # clean spines
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)

-    ax = sns.barplot(
-        data=df,
-        x="File",
-        y="Speedup",
-        hue="Type",
-        palette=palette,
-        dodge=False,  # Don't split bars based on hue
-    )
-    # Remove the hue legend as it's self-explanatory
-    if ax.get_legend():
-        ax.get_legend().remove()
-    # ---------------------------
-
-    top3_idx = df.nlargest(min(3, len(df)), "Speedup").index
-    bot3_idx = df.nsmallest(min(3, len(df)), "Speedup").index
-    label_idx = set(top3_idx.tolist() + bot3_idx.tolist())
-
-    # Add the text labels over the bars
-    # We need to iterate through the patches (the actual bars drawn)
-    for i, patch in enumerate(ax.patches):
-        if i in label_idx:
-            # Get X and Y coordinates from the bar itself
-            x_coords = patch.get_x() + patch.get_width() / 2
-            y_coords = patch.get_height()
-
-            val = df.iloc[i]["Speedup"]
-
-            plt.text(
-                x_coords,
-                y_coords + 0.02,
-                f"{val:.2f}x",
-                ha="center",
-                va="bottom",
-                color="black",  # Black is usually easier to read than red on white
-                fontsize=10,
-                fontweight="bold",
-            )
-
-    plt.xticks(rotation=70, ha="right", fontsize=11)
-    plt.ylabel("Speedup Ratio (Higher is better)", fontsize=13)
-    plt.xlabel("Benchmark File", fontsize=13)
-    plt.title("Current Speedup vs Original", fontsize=15, fontweight="bold")
-
-    plt.axhline(y=1.0, color="gray", linestyle="--", linewidth=1)
-
-    max_val = df["Speedup"].max()
-    plt.ylim(0, max(max_val * 1.15, 1.1))  # Ensure at least a little headroom above 1.0
-
-    sns.despine()
-
-    plt.tight_layout()
-
-    print(f"Saving plot to {OUT_PNG} with dimensions ({calculated_width:.1f}x{calculated_height:.1f} inches)")
-    plt.savefig(OUT_PNG, dpi=300, bbox_inches="tight")
-
-    # Optional: Also save as SVG for perfect clarity
-    # svg_path = OUT_PNG.replace(".png", ".svg")
-    # plt.savefig(svg_path, bbox_inches='tight')
-    # print(f"Also saved SVG version to {svg_path}")
-
-    plt.close()
+    fig.tight_layout()
+    print(f"Saving plot to {OUT_PNG} ({fig_w:.1f}x{fig_h:.1f} inches)")
+    fig.savefig(OUT_PNG, bbox_inches="tight")
+    # Optional: also save SVG
+    # fig.savefig(OUT_PNG.replace(".png", ".svg"), bbox_inches="tight")
+    plt.close(fig)


 env = {"TL_PERF_REGRESSION_FORMAT": "json"}
diff --git a/tilelang/testing/perf_regression.py b/tilelang/testing/perf_regression.py
index 218db3fcf..e46a6d7b9 100644
--- a/tilelang/testing/perf_regression.py
+++ b/tilelang/testing/perf_regression.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 from typing import Any, Callable
 from collections.abc import Sequence
+import warnings

 try:
     from tabulate import tabulate
@@ -31,6 +32,7 @@ class PerfResult:


 _RESULTS: list[PerfResult] = []
+_MAX_RETRY_NUM = 5
 _RESULTS_JSON_PREFIX = "__TILELANG_PERF_RESULTS_JSON__="


@@ -59,7 +61,7 @@ def _reset_results() -> None:
     _RESULTS.clear()


-def process_func(func: Callable[..., float], name: str | None = None, /, **kwargs: Any) -> float:
+def process_func(func: Callable[..., float], name: str | None = None, /, **kwargs: Any) -> None:
     """Execute a single perf function and record its latency.

     `func` is expected to return a positive latency scalar (seconds or ms; we
@@ -69,8 +71,12 @@ def process_func(func: Callable[..., float], name: str | None = None, /, **kwarg
     if result_name.startswith("regression_"):
         result_name = result_name[len("regression_") :]
     latency = float(func(**kwargs))
-    if not (latency > 0.0):
-        print(f"Warning: non-positive latency {latency} from {result_name}")
+    _iter = 0
+    while latency <= 0.0 and _iter < _MAX_RETRY_NUM:
+        latency = float(func(**kwargs))
+        _iter += 1
+    if latency <= 0.0:
+        warnings.warn(f"{result_name} has latency {latency} <= 0. Please verify the profiling results.", RuntimeWarning, 1)
         return
     _RESULTS.append(PerfResult(name=result_name, latency=latency))
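
Note (not part of the patch above): a minimal, self-contained sketch of the retry behavior that the process_func change introduces, re-running a benchmark while it reports a non-positive latency, up to _MAX_RETRY_NUM extra attempts, and warning if it never turns positive. The measure_with_retries name and the bench callable are hypothetical illustrations, not tilelang API.

import warnings
from typing import Callable

_MAX_RETRY_NUM = 5  # same retry budget as the constant added in tilelang/testing/perf_regression.py

def measure_with_retries(bench: Callable[[], float]) -> float | None:
    # First measurement, mirroring latency = float(func(**kwargs)) in process_func.
    latency = float(bench())
    attempts = 0
    # Re-run while the result is non-positive, up to _MAX_RETRY_NUM extra attempts.
    while latency <= 0.0 and attempts < _MAX_RETRY_NUM:
        latency = float(bench())
        attempts += 1
    if latency <= 0.0:
        warnings.warn(f"latency {latency} <= 0 after {attempts} retries; check the profiling setup",
                      RuntimeWarning, stacklevel=2)
        return None
    return latency

# Example usage (hypothetical): measure_with_retries(lambda: profiler.do_bench(backend="cupti"))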