diff --git a/.github/workflows/pr-regression-test-bot.yml b/.github/workflows/pr-regression-test-bot.yml
index 59ce74ad4..c7be92298 100644
--- a/.github/workflows/pr-regression-test-bot.yml
+++ b/.github/workflows/pr-regression-test-bot.yml
@@ -23,8 +23,12 @@ env:
   COLUMNS: "100"
   FORCE_COLOR: "1"
   CLICOLOR_FORCE: "1"
+  UV_INDEX_STRATEGY: "unsafe-best-match"
+  UV_HTTP_TIMEOUT: "600"
   XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated
   PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated
+  UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated
+  PRE_COMMIT_HOME: "${{ github.workspace }}/.cache/pip/.pre-commit" # to be updated

 jobs:
   pr-regression:
@@ -33,8 +37,35 @@ jobs:
       github.repository_owner == 'tile-ai' &&
       github.event.issue.pull_request &&
       (contains(github.event.comment.body, '@regression-perf'))
-    runs-on: [self-hosted, nvidia]
+    runs-on: ${{ matrix.runner.tags }}
+    strategy:
+      matrix:
+        runner:
+          - tags: [self-hosted, nvidia]
+            name: self-hosted-nvidia
+            toolkit: CUDA-12.8
+        python-version:
+          - "3.12"
+      fail-fast: false
+    timeout-minutes: 120
     steps:
+      - name: Get commenter permission
+        id: perm
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const username = context.payload.comment.user.login
+            const { owner, repo } = context.repo
+            const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username })
+            core.setOutput('permission', data.permission) // admin|maintain|write|triage|read|none
+
+      - name: Reject if not allowed
+        if: ${{ steps.perm.outputs.permission != 'admin' && steps.perm.outputs.permission != 'maintain' && steps.perm.outputs.permission != 'write' }}
+        run: |
+          echo "Not authorized: permission=${{ steps.perm.outputs.permission }}"
+          exit 1
+
       - name: Checkout repository
         uses: actions/checkout@v6
         with:
@@ -42,33 +73,145 @@ jobs:
         fetch-depth: 0
         submodules: recursive

-      - name: Setup Python
-        uses: actions/setup-python@v6
+      - name: Set environment (self-hosted runners)
+        if: startsWith(matrix.runner.name, 'self-hosted')
+        run: |
+          # Hide sensitive data in logs for self-hosted runners
+          if [[ -n "${{ secrets.SECRET_PATH_PREFIXES }}" ]]; then
+            echo "::add-mask::${{ secrets.SECRET_PATH_PREFIXES }}"
+            # Colon separated list of secrets to mask
+            for secret in $(echo "${{ secrets.SECRET_PATH_PREFIXES }}" | tr ':' '\n'); do
+              echo "::add-mask::${secret}"
+            done
+          fi
+
+          # Use runner tool_cache as cache root for self-hosted runners to avoid internet connection
+          # issues and to share cache between jobs.
+          export XDG_CACHE_HOME="${{ runner.tool_cache }}/.ci-cache-${{ github.workflow }}"
+          echo "XDG_CACHE_HOME=${XDG_CACHE_HOME}" | tee -a "${GITHUB_ENV}"
+          echo "PIP_CACHE_DIR=${XDG_CACHE_HOME}/pip" | tee -a "${GITHUB_ENV}"
+          echo "UV_CACHE_DIR=${XDG_CACHE_HOME}/uv" | tee -a "${GITHUB_ENV}"
+          echo "PRE_COMMIT_HOME=${XDG_CACHE_HOME}/pip/.pre-commit" | tee -a "${GITHUB_ENV}"
+
+      # Do not use ccache on self-hosted runners, as it will download/upload caches which is slow.
+      # Self-hosted runners usually have more CPU power to compile without ccache.
+      - name: Setup ccache (GitHub-hosted runners)
+        id: setup-ccache
+        if: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
+        uses: hendrikmuhs/ccache-action@v1
+        with:
+          create-symlink: true
+          evict-old-files: "7d"
+          append-timestamp: false
+          key: ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}-${{ hashFiles('**/*.cc') }}
+            ${{ runner.os }}-${{ runner.arch }}-${{ matrix.runner.toolkit }}
+            ${{ runner.os }}-${{ runner.arch }}
+
+      - name: Set environment (CUDA)
+        if: contains(matrix.runner.toolkit, 'CUDA')
+        run: |
+          TOOLKIT="${{ matrix.runner.toolkit }}"
+          CUDA_VERSION="${TOOLKIT##*-}"
+          CUDA_VERSION_MAJMIN="$(echo ${CUDA_VERSION} | cut -d '.' -f-2)"
+          CUDA_VERSION_MAJMIN_NODOT="${CUDA_VERSION_MAJMIN//./}"
+          if [[ "${TOOLKIT}" == "Nightly-"* ]]; then
+            # Use torch nightly builds
+            export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MAJMIN_NODOT}"
+          else
+            export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MAJMIN_NODOT}"
+          fi
+          export UV_INDEX="${PIP_EXTRA_INDEX_URL}"
+          export CLANG_TIDY_CMAKE_OPTIONS="${CLANG_TIDY_CMAKE_OPTIONS} -DUSE_CUDA=ON"
+
+          echo "USE_CUDA=ON" | tee -a "${GITHUB_ENV}"
+          echo "CUDA_VERSION=${CUDA_VERSION}" | tee -a "${GITHUB_ENV}"
+          echo "CUDA_VERSION_MAJMIN=${CUDA_VERSION_MAJMIN}" | tee -a "${GITHUB_ENV}"
+          echo "CUDA_VERSION_MAJMIN_NODOT=${CUDA_VERSION_MAJMIN_NODOT}" | tee -a "${GITHUB_ENV}"
+          echo "PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}" | tee -a "${GITHUB_ENV}"
+          echo "UV_INDEX=${UV_INDEX}" | tee -a "${GITHUB_ENV}"
+          echo "CLANG_TIDY_CMAKE_OPTIONS=${CLANG_TIDY_CMAKE_OPTIONS}" | tee -a "${GITHUB_ENV}"
+
+          if [[ ! -x "$(command -v nvcc)" ]]; then
+            export PATH="/usr/local/cuda/bin:${PATH}"
+            export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+            echo "PATH=${PATH}" | tee -a "${GITHUB_ENV}"
+            echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" | tee -a "${GITHUB_ENV}"
+          fi
+          if [[ -x "$(command -v nvcc)" ]]; then
+            echo "\$ $(command -v nvcc) --version" && nvcc --version
+          else
+            echo "::warning::nvcc not found in PATH!"
+          fi
+
+      - name: Setup Python and uv with caching
+        id: setup-uv
+        uses: astral-sh/setup-uv@v7
         with:
-          python-version: "3.12"
-          update-environment: true
-          cache: pip
-          cache-dependency-path: |
+          python-version: ${{ matrix.python-version }}
+          activate-environment: true
+          # Do not use cache for self-hosted runners, as it will download/upload caches which is slow.
+          enable-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
+          prune-cache: ${{ !startsWith(matrix.runner.name, 'self-hosted') }}
+          # Use runner tool_cache for self-hosted runners
+          cache-local-path: ${{ env.UV_CACHE_DIR }}
+          ignore-nothing-to-cache: true
+          # Extra cache key to upload/download caches on GitHub-hosted runners
+          cache-suffix: uv-${{ runner.os }}-${{ runner.arch }}-${{ matrix.python-version }}-${{ matrix.runner.name }}-${{ matrix.runner.toolkit }}
+          cache-dependency-glob: |
            pyproject.toml
            requirements*.txt

-      - name: Install PR version (new)
+      - name: Setup environments
+        id: setup-venv
         run: |
-          python -m venv new
+          set -e
+
+          uv venv --python "${{ matrix.python-version }}" new
+
           source new/bin/activate
-          pip install --no-user -r requirements-test.txt
-          pip install --no-user .
+          uv pip install -v -r requirements-test.txt
+          uv pip install -v .

-      - name: Install main version (old)
+      - name: Install Main version (Baseline)
         run: |
-          echo "Check files to be deleted!"
+          set -e
           git clean -dxf -e new/ -e .cache/
-          echo "Delete files completed!"
           git checkout main
-          python -m venv old
+          git submodule update --init --recursive
+          uv venv --python "${{ matrix.python-version }}" old
           source old/bin/activate
-          pip install --no-user -r requirements-test.txt
-          pip install --no-user .
+
+          uv pip install -v -r requirements-test.txt
+          uv pip install -v .
+
+      - name: Clear uv cache for self-hosted runners (if setup failed)
+        if: >-
+          ${{
+            failure() &&
+            startsWith(matrix.runner.name, 'self-hosted') &&
+            (steps.setup-uv.conclusion == 'failure' || steps.setup-venv.conclusion == 'failure')
+          }}
+        run: |
+          echo "Clearing uv cache at ${UV_CACHE_DIR} due to failure."
+          uv cache clean
+
+      - name: Enable core dump generation (Linux / GitHub-hosted runners)
+        if: ${{ runner.os == 'Linux' && !startsWith(matrix.runner.name, 'self-hosted') }}
+        run: |
+          sudo sysctl -w kernel.core_pattern="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P"
+          sudo sysctl -w kernel.core_uses_pid=0
+          sudo sysctl -w fs.suid_dumpable=1
+          sysctl kernel.core_pattern kernel.core_uses_pid fs.suid_dumpable
+
+      - name: Enable core dump generation (macOS / GitHub-hosted runners)
+        if: ${{ runner.os == 'macOS' && !startsWith(matrix.runner.name, 'self-hosted') }}
+        run: |
+          sudo sysctl -w kern.corefile="core.${{ matrix.python-version }}.${{ matrix.runner.toolkit }}.%P"
+          sudo sysctl -w kern.coredump=1
+          sudo sysctl -w kern.sugid_coredump=1
+          sysctl kern.corefile kern.coredump kern.sugid_coredump

       - name: Run performance regression test
         run: |
diff --git a/examples/dynamic_shape/example_dynamic.py b/examples/dynamic_shape/example_dynamic.py
index 88a53e59d..a2d5b46ae 100644
--- a/examples/dynamic_shape/example_dynamic.py
+++ b/examples/dynamic_shape/example_dynamic.py
@@ -4,7 +4,7 @@
 from tilelang import tvm as tvm


-@tilelang.jit(pass_configs={"tl.disable_dynamic_tail_split": True, "tl.dynamic_alignment": 8})
+@tilelang.jit
 def matmul_dynamic_mnk(
     block_M,
     block_N,
diff --git a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
index 9e3089d23..463786bd5 100644
--- a/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_gqa_bwd_wgmma_pipelined.py
@@ -339,38 +339,23 @@ def run1():
     print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9))


-def run_regression_perf():
-    BATCH = 1
-    H = 32
-    N_CTX = 256
-    D_HEAD_QK = 192
-    D_HEAD_V = 128
-    groups = 16
-    causal = False
-    device = "cuda"
-    torch.manual_seed(42)
+def run_regression_perf(
+    BATCH: int = 1, H: int = 32, N_CTX: int = 256, D_HEAD_QK: int = 192, D_HEAD_V: int = 128, groups: int = 16, causal: bool = False
+):
+    Q = torch.empty(BATCH, N_CTX, H, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
+
     head_kv = H // groups
-    Q = torch.randn(BATCH, N_CTX, H, D_HEAD_QK, device=device, dtype=torch.half)
-    K = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.half)
-    V = torch.randn(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.half)
-    O = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half)
-    dO = torch.randn(BATCH, N_CTX, H, D_HEAD_V, device=device, dtype=torch.half)
-    lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32)
-    with torch.no_grad():
-        mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V)
-        kernel = flashattn_bwd(
-            BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, causal, block_M=128, block_N=32, threads=256, num_stages=2, groups=groups
-        )
-    dQ = torch.zeros_like(Q, dtype=torch.float32)
-    dK = torch.zeros(BATCH, N_CTX, head_kv, D_HEAD_QK, device=device, dtype=torch.float32)
-    dV = torch.zeros(BATCH, N_CTX, head_kv, D_HEAD_V, device=device, dtype=torch.float32)
-    Delta = mod_prep(O, dO)
-    from tilelang.profiler import do_bench
+    K = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_QK, dtype=torch.half, device="cuda").normal_().requires_grad_()
+    V = torch.empty(BATCH, N_CTX, head_kv, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
+    dO = torch.empty(BATCH, N_CTX, H, D_HEAD_V, dtype=torch.half, device="cuda").normal_().requires_grad_()
+    O = attention(Q, K, V, causal, groups)
+
+    def run1():
+        O.backward(dO, retain_graph=True)

-    def run_kernel_only():
-        kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV)
+    from tilelang.profiler import do_bench

-    return do_bench(run_kernel_only, warmup=10, rep=100, backend="cupti")
+    return do_bench(run1, warmup=500, backend="cupti")


 if __name__ == "__main__":
diff --git a/examples/flash_attention/example_gqa_fwd_bshd.py b/examples/flash_attention/example_gqa_fwd_bshd.py
index cb3acc808..dea941642 100644
--- a/examples/flash_attention/example_gqa_fwd_bshd.py
+++ b/examples/flash_attention/example_gqa_fwd_bshd.py
@@ -247,7 +247,6 @@ def run_regression_perf(
     batch: int = 1, heads: int = 64, seq_len: int = 4096, dim: int = 128, is_causal: bool = False, groups: int = 16, tune: bool = False
 ):
     kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=64, block_N=64, num_stages=2, threads=128)
-    profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
     return profiler.do_bench(backend="cupti")


diff --git a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
index c59d693fd..5c0386410 100644
--- a/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
+++ b/examples/flash_attention/example_gqa_fwd_bshd_wgmma_pipelined.py
@@ -237,17 +237,11 @@ def run_regression_perf(
     dim: int = 128,
     is_causal: bool = False,
     groups: int = 16,
-    tune: bool = False,
 ):

-    flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim
-    total_flops = 2 * flops_per_matmul
-    if is_causal:
-        total_flops *= 0.5
-
     kernel = flashattn(batch, heads, seq_len, dim, is_causal, groups=groups, block_M=128, block_N=128, num_stages=2, threads=256)
     profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)
-    return profiler.do_bench(warmup=500, backend="cupti")
+    return profiler.do_bench(backend="cupti")


 if __name__ == "__main__":
diff --git a/examples/flash_attention/example_mha_fwd_varlen.py b/examples/flash_attention/example_mha_fwd_varlen.py
index 67165de1a..1f0b9555d 100644
--- a/examples/flash_attention/example_mha_fwd_varlen.py
+++ b/examples/flash_attention/example_mha_fwd_varlen.py
@@ -335,7 +335,7 @@ def run_regression_perf(batch: int = 8, heads: int = 64, seq_len: int = 2048, di
     UQ = q_unpad.shape[0]
     UK = k_unpad.shape[0]
     UKV = k_unpad.shape[0]
-    kernel = flashattn(batch, UQ, UKV, heads, dim, causal)
+    kernel = flashattn(batch, UQ, UKV, heads, dim, causal, block_M=128, block_N=128, num_stages=2, threads=256)

     from tilelang.profiler import do_bench

diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
index 2e88bb2be..c7b8a72ab 100644
--- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
+++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py
@@ -227,11 +227,11 @@ def main():
 def run_regression_perf():
     M, N, K = 128, 128, 128
     out_dtype, accum_dtype = "float32", "float32"
-    in_dtype = "float8_e4m3"
+    in_dtype = T.float8_e4m3fn
     kernel_e4m3 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype)
     profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer)
     latency_e4m3 = profiler_e4m3.do_bench(warmup=25, backend="cupti")
-    in_dtype = "float8_e5m2"
+    in_dtype = T.float8_e5m2
     kernel_e5m2 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype)
     profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer)
     latency_e5m2 = profiler_e5m2.do_bench(warmup=25, backend="cupti")
diff --git a/examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py b/examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py
deleted file mode 100644
index 1f05f164f..000000000
--- a/examples/gemm_streamk/regression_example_tilelang_gemm_splitk.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import tilelang.testing
-import example_tilelang_gemm_streamk
-
-
-def regression_example_tilelang_gemm_streamk():
-    tilelang.testing.process_func(example_tilelang_gemm_streamk.run_regression_perf)
-
-
-if __name__ == "__main__":
-    tilelang.testing.regression()
diff --git a/maint/scripts/test_perf_regression.py b/maint/scripts/test_perf_regression.py
index 11a50c73b..00502724b 100644
--- a/maint/scripts/test_perf_regression.py
+++ b/maint/scripts/test_perf_regression.py
@@ -5,6 +5,8 @@
 import json
 from tabulate import tabulate
 import pandas as pd
+import numpy as np
+import textwrap

 try:
     import tilelang
@@ -48,86 +50,132 @@ def run_cmd(cmd, env=None):
     return p.stdout


-def draw(df):
+def draw(df: pd.DataFrame) -> None:
     import matplotlib.pyplot as plt
-    import seaborn as sns

-    if len(df) == 0:
+    if df is None or len(df) == 0:
         return

-    num_items = len(df)
-    calculated_width = max(8, num_items * 0.6)
-    calculated_height = 10  # A reasonable fixed height
-
-    plt.figure(figsize=(calculated_width, calculated_height))
+    # ---- copy + sanitize ----
+    df = df.copy()
+    df["Speedup"] = pd.to_numeric(df["Speedup"], errors="coerce")
+    df = df.dropna(subset=["Speedup"])
+
+    # categorize
+    df["Performance"] = np.where(df["Speedup"] >= 1.0, "Improved", "Regressed")
+    df["DeltaPct"] = (df["Speedup"] - 1.0) * 100.0
+
+    # sort: worst regressions at top? (common for dashboards)
+    # If you prefer best-to-worst, change ascending=False
+    df = df.sort_values("Speedup", ascending=True).reset_index(drop=True)
+
+    # ---- style ----
+    plt.rcParams.update(
+        {
+            "figure.dpi": 120,
+            "savefig.dpi": 300,
+            "axes.titlesize": 16,
+            "axes.labelsize": 12,
+            "xtick.labelsize": 10,
+            "ytick.labelsize": 10,
+        }
+    )

-    font_scale = 1.1 if num_items > 20 else 0.9
-    sns.set_theme(style="whitegrid", font_scale=font_scale)
+    n = len(df)
+    # height: ~0.35 inch per row + margins, with a sensible cap/floor
+    fig_h = min(max(6.0, 0.35 * n + 2.2), 22.0)
+    fig_w = 14.0
+    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
+
+    # palette
+    colors = {"Improved": "#2ecc71", "Regressed": "#e74c3c"}
+    bar_colors = df["Performance"].map(colors).tolist()
+
+    # wrap long labels (optional)
+    def wrap_label(s: str, width: int = 42) -> str:
+        return "\n".join(textwrap.wrap(str(s), width=width)) if len(str(s)) > width else str(s)
+
+    ylabels = [wrap_label(x) for x in df["File"].tolist()]
+    y = np.arange(n)
+
+    # bars
+    ax.barh(y, df["Speedup"].values, color=bar_colors, edgecolor="black", linewidth=0.4, height=0.72)
+
+    # baseline at 1.0x
+    ax.axvline(1.0, linestyle="--", linewidth=1.4, alpha=0.85)
+
+    # grid
+    ax.xaxis.grid(True, linestyle="-", linewidth=0.6, alpha=0.25)
+    ax.set_axisbelow(True)
+
+    # y ticks
+    ax.set_yticks(y)
+    ax.set_yticklabels(ylabels)
+
+    # x limits with padding (ensure 1.0 included)
+    x_min = float(df["Speedup"].min())
+    x_max = float(df["Speedup"].max())
+    pad = max(0.02, (x_max - x_min) * 0.12)
+    left = min(1.0, x_min) - pad
+    right = max(1.0, x_max) + pad
+    ax.set_xlim(left, right)
+
+    # annotate each bar
+    for i, (sx, dp) in enumerate(zip(df["Speedup"].values, df["DeltaPct"].values)):
+        label = f"{sx:.3f}x ({dp:+.2f}%)"
+        # place to right for improved, left for regressed (near bar end)
+        if sx >= 1.0:
+            ax.text(sx + 0.003, i, label, va="center", ha="left", fontsize=9)
+        else:
+            ax.text(sx - 0.003, i, label, va="center", ha="right", fontsize=9)
+
+    # labels & title
+    ax.set_xlabel("Speedup Ratio (New / Old)")
+    ax.set_ylabel("Benchmark File")
+    ax.set_title("Performance Regression Analysis")
+
+    # legend
+    from matplotlib.patches import Patch
+
+    legend_handles = [
+        Patch(facecolor=colors["Improved"], edgecolor="black", label="Improved (>= 1.0x)"),
+        Patch(facecolor=colors["Regressed"], edgecolor="black", label="Regressed (< 1.0x)"),
+    ]
+    ax.legend(handles=legend_handles, loc="upper left", frameon=True)
+
+    # summary box
+    num_improved = int((df["Performance"] == "Improved").sum())
+    num_regressed = int((df["Performance"] == "Regressed").sum())
+    best = df.iloc[df["Speedup"].idxmax()]
+    worst = df.iloc[df["Speedup"].idxmin()]
+    summary = (
+        f"Items: {n}\n"
+        f"Improved: {num_improved}\n"
+        f"Regressed: {num_regressed}\n"
+        f"Best: {best['File']} {best['Speedup']:.3f}x\n"
+        f"Worst: {worst['File']} {worst['Speedup']:.3f}x"
+    )
+    ax.text(
+        0.99,
+        0.01,
+        summary,
+        transform=ax.transAxes,
+        ha="right",
+        va="bottom",
+        fontsize=9,
+        bbox=dict(boxstyle="round,pad=0.45", facecolor="white", edgecolor="0.3", alpha=0.9),
+    )

-    df["Type"] = df["Speedup"].apply(lambda x: "Speedup" if x >= 1.0 else "Slowdown")
-    palette = {"Speedup": "#4CAF50", "Slowdown": "#F44336"}  # Green for good, Red for bad
+    # clean spines
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)

-    ax = sns.barplot(
-        data=df,
-        x="File",
-        y="Speedup",
-        hue="Type",
-        palette=palette,
-        dodge=False,  # Don't split bars based on hue
-    )
-    # Remove the hue legend as it's self-explanatory
-    if ax.get_legend():
-        ax.get_legend().remove()
-    # ---------------------------
-
-    top3_idx = df.nlargest(min(3, len(df)), "Speedup").index
-    bot3_idx = df.nsmallest(min(3, len(df)), "Speedup").index
-    label_idx = set(top3_idx.tolist() + bot3_idx.tolist())
-
-    # Add the text labels over the bars
-    # We need to iterate through the patches (the actual bars drawn)
-    for i, patch in enumerate(ax.patches):
-        if i in label_idx:
-            # Get X and Y coordinates from the bar itself
-            x_coords = patch.get_x() + patch.get_width() / 2
-            y_coords = patch.get_height()
-
-            val = df.iloc[i]["Speedup"]
-
-            plt.text(
-                x_coords,
-                y_coords + 0.02,
-                f"{val:.2f}x",
-                ha="center",
-                va="bottom",
-                color="black",  # Black is usually easier to read than red on white
-                fontsize=10,
-                fontweight="bold",
-            )
-
-    plt.xticks(rotation=70, ha="right", fontsize=11)
-    plt.ylabel("Speedup Ratio (Higher is better)", fontsize=13)
-    plt.xlabel("Benchmark File", fontsize=13)
-    plt.title("Current Speedup vs Original", fontsize=15, fontweight="bold")
-
-    plt.axhline(y=1.0, color="gray", linestyle="--", linewidth=1)
-
-    max_val = df["Speedup"].max()
-    plt.ylim(0, max(max_val * 1.15, 1.1))  # Ensure at least a little headroom above 1.0
-
-    sns.despine()
-
-    plt.tight_layout()
-
-    print(f"Saving plot to {OUT_PNG} with dimensions ({calculated_width:.1f}x{calculated_height:.1f} inches)")
-    plt.savefig(OUT_PNG, dpi=300, bbox_inches="tight")
-
-    # Optional: Also save as SVG for perfect clarity
-    # svg_path = OUT_PNG.replace(".png", ".svg")
-    # plt.savefig(svg_path, bbox_inches='tight')
-    # print(f"Also saved SVG version to {svg_path}")
-
-    plt.close()
+    fig.tight_layout()
+    print(f"Saving plot to {OUT_PNG} ({fig_w:.1f}x{fig_h:.1f} inches)")
+    fig.savefig(OUT_PNG, bbox_inches="tight")
+    # Optional: also save SVG
+    # fig.savefig(OUT_PNG.replace(".png", ".svg"), bbox_inches="tight")
+    plt.close(fig)


 env = {"TL_PERF_REGRESSION_FORMAT": "json"}
diff --git a/tilelang/testing/perf_regression.py b/tilelang/testing/perf_regression.py
index 218db3fcf..e46a6d7b9 100644
--- a/tilelang/testing/perf_regression.py
+++ b/tilelang/testing/perf_regression.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 from typing import Any, Callable
 from collections.abc import Sequence
+import warnings

 try:
     from tabulate import tabulate
@@ -31,6 +32,7 @@ class PerfResult:


 _RESULTS: list[PerfResult] = []
+_MAX_RETRY_NUM = 5
 _RESULTS_JSON_PREFIX = "__TILELANG_PERF_RESULTS_JSON__="


@@ -59,7 +61,7 @@ def _reset_results() -> None:
     _RESULTS.clear()


-def process_func(func: Callable[..., float], name: str | None = None, /, **kwargs: Any) -> float:
+def process_func(func: Callable[..., float], name: str | None = None, /, **kwargs: Any) -> None:
     """Execute a single perf function and record its latency.

     `func` is expected to return a positive latency scalar (seconds or ms; we
@@ -69,8 +71,12 @@ def process_func(func: Callable[..., float], name: str | None = None, /, **kwarg
     if result_name.startswith("regression_"):
         result_name = result_name[len("regression_") :]
     latency = float(func(**kwargs))
-    if not (latency > 0.0):
-        print(f"Warning: non-positive latency {latency} from {result_name}")
+    _iter = 0
+    while latency <= 0.0 and _iter < _MAX_RETRY_NUM:
+        latency = float(func(**kwargs))
+        _iter += 1
+    if latency <= 0.0:
+        warnings.warn(f"{result_name} has latency {latency} <= 0. Please verify the profiling results.", RuntimeWarning, 1)
         return
     _RESULTS.append(PerfResult(name=result_name, latency=latency))
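
Note (not part of the patch above): a minimal, self-contained sketch of the retry behavior that the process_func change introduces, re-running a benchmark while it reports a non-positive latency, up to _MAX_RETRY_NUM extra attempts, and warning if it never turns positive. The measure_with_retries name and the bench callable are hypothetical illustrations, not tilelang API.

import warnings
from typing import Callable

_MAX_RETRY_NUM = 5  # same retry budget as the constant added in tilelang/testing/perf_regression.py

def measure_with_retries(bench: Callable[[], float]) -> float | None:
    # First measurement, mirroring latency = float(func(**kwargs)) in process_func.
    latency = float(bench())
    attempts = 0
    # Re-run while the result is non-positive, up to _MAX_RETRY_NUM extra attempts.
    while latency <= 0.0 and attempts < _MAX_RETRY_NUM:
        latency = float(bench())
        attempts += 1
    if latency <= 0.0:
        warnings.warn(f"latency {latency} <= 0 after {attempts} retries; check the profiling setup",
                      RuntimeWarning, stacklevel=2)
        return None
    return latency

# Example usage (hypothetical): measure_with_retries(lambda: profiler.do_bench(backend="cupti"))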